#full_data <- readRDS('../data/full_data_20230930.rds')
hourly_full <- readRDS('../data/hourly_full_20230930.rds')
daily_full <- readRDS('../data/daily_full_20230930.rds')

Explore some summary statistics

daily_full <- daily_full %>%
  filter(!is.na(H2S_daily_avg))
hourly_full <- hourly_full %>%
  filter(!is.na(H2S_hourly_avg))
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  4850693 259.1    8355088 446.3         NA  7392967 394.9
## Vcells 19281450 147.2   37136747 283.4      24576 31973264 244.0
# full_data <- full_data %>%
#   filter(!is.na(H2S)) %>%
#   select(-starts_with('daily_'), -starts_with('H2S_daily'), 
#          -all_of(c('Ammonia', 'Benzene', 'Black Carbon', 'DST', 'utm_x', 'utm_y',
#                    'county')))
# gc()

Fixed Monitor stats

monitor_names <- c("ElSegundo" = "El Segundo", 
                   "StAnthony" = "St. Anthony",
                   "Manhattan" = "Manhattan",
                   "WestHS" = "West HS",
                   "ElmAve" = "Elm Ave",
                   "NorthHS" = "North HS",
                   "GuenserPark" = "Guenser Park",
                   "Chico" = "213th & Chico",
                   "Judson" = "Judson",
                   "HarborPark" = "Harbor Park",
                   "FirstMethodist" = "First Methodist",
                   "GStreet" = "G Street",
                   "StLuke" = "St. Luke",
                   "Hudson" = "Hudson",
                   "InnerPort" = "Inner Port")

base_monitor_stat <- daily_full %>%
  group_by(Monitor) %>%
  summarise('Start Date' = strftime(min(day), '%Y-%m-%d'),
            'End Date' = strftime(max(day), '%Y-%m-%d'),
            'Closest Refinery' = unique(closest_ref),
            'Distance to Nearest Refinery (m)' = round(unique(dist_ref)),
            'Angle to Refinery' = unique(angle_ref),
            'Distance to Nearest WRP (m)' = round(unique(dist_wrp)),
            'Capacity of Nearest WRP' = unique(closest_wrp_capacity),
            'Angle to WRP' = round(unique(angle_wrp)),
            'Distance to Dominguez Channel (m)' = round(unique(dist_dc)),
            'Elevation' = unique(elevation),
            'Enhanced Vegetation Index' = unique(EVI)) %>%
  mutate(`Closest Refinery` = case_when(`Closest Refinery` == "Phillips 66 (Wilmington)" ~ "Phillips 66",
                                        `Closest Refinery` == "Torrance Refinery" ~ "Torrance",
                                        `Closest Refinery` == "Valero Refinery" ~ "Valero",
                                        `Closest Refinery` == "Marathon (Carson)" ~ "Marathon Carson",
                                        `Closest Refinery` == "Marathon (Wilmington)" ~ "Marathon Wilmington",
                                        .default = `Closest Refinery`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(base_monitor_stat, digits = 2, format = 'html')
Monitor Start Date End Date Closest Refinery Distance to Nearest Refinery (m) Angle to Refinery Distance to Nearest WRP (m) Capacity of Nearest WRP Angle to WRP Distance to Dominguez Channel (m) Elevation Enhanced Vegetation Index
El Segundo 2020-01-01 2020-08-13 Chevron El Segundo 1174 190.87 2810 850 294 6437 60 0.17
St. Anthony 2020-04-23 2023-09-30 Chevron El Segundo 970 188.57 2825 850 298 6543 44 0.17
Manhattan 2020-03-24 2023-09-30 Chevron El Segundo 2341 341.46 5462 850 325 6145 42 0.19
West HS 2020-01-01 2022-04-29 Torrance 3536 85.69 9333 400 123 1547 36 0.15
Elm Ave 2020-01-01 2023-09-30 Torrance 1362 5.34 5966 400 132 3955 32 0.07
North HS 2020-01-01 2022-04-29 Torrance 1779 160.72 8569 400 145 4858 24 0.15
Guenser Park 2020-04-23 2023-09-30 Torrance 2400 220.25 7702 400 159 375 16 0.14
213th & Chico 2021-10-14 2022-01-28 Marathon Carson 2879 145.62 4297 400 213 50 7 0.12
Judson 2020-02-25 2023-09-30 Marathon Carson 2715 112.45 2692 400 213 1481 13 0.14
Harbor Park 2020-01-01 2023-09-30 Phillips 66 1463 183.71 2012 400 6 4262 12 0.60
First Methodist 2020-03-04 2023-09-30 Phillips 66 1124 205.55 2456 400 355 3792 14 0.21
G Street 2021-01-20 2023-09-30 Phillips 66 717 222.51 2940 400 356 3748 8 0.09
St. Luke 2020-02-18 2023-09-30 Marathon Carson 2768 260.38 6910 400 256 1790 10 0.17
Hudson 2020-01-01 2023-09-30 Marathon Wilmington 1378 240.55 5920 400 272 705 8 0.14
Inner Port 2020-04-22 2023-09-30 Valero 2022 260.92 5970 15 228 1937 5 0.04

Since Feb 2022

sincefeb2022_stat <- daily_full %>%
  filter(day > '2022-01-31') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 608 140.15 3.41 2.45 0.52 7.83 236.36 4.65 1.96
Manhattan 599 6.82 1.40 0.82 0.39 3.40 233.30 0.04 0.00
West HS 88 2.90 0.88 1.09 0.40 6.24 272.30 0.03 3.00
Elm Ave 608 5.10 3.10 1.60 0.89 4.53 243.43 0.04 2.00
North HS 89 4.00 1.61 1.37 0.51 5.36 240.86 0.00 0.00
Guenser Park 546 7.66 2.66 1.76 0.80 4.43 260.55 0.02 0.00
Judson 583 9.75 2.80 1.69 0.54 3.80 272.41 0.53 0.00
Harbor Park 414 9.65 2.48 1.85 0.50 2.89 296.16 0.07 44.30
First Methodist 606 14.72 2.34 2.33 0.71 3.27 269.13 0.11 29.22
G Street 607 39.18 2.72 3.82 0.83 4.96 277.89 0.11 19.36
St. Luke 602 11.62 3.43 2.25 0.74 3.66 291.26 0.23 5.00
Hudson 599 98.17 6.08 2.95 0.99 3.49 39.57 0.23 2.70
Inner Port 582 53.70 5.39 6.31 0.87 5.07 227.02 0.05 83.81

During Disaster (October 2021 - December 2021)

disaster_stat <- daily_full %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(disaster_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 92 20.21 2.41 3.01 0.69 7.07 343.87 2.96 3.00
Manhattan 92 25.45 4.06 3.48 1.51 2.74 212.03 0.11 0.00
West HS 92 41.50 4.87 3.50 0.74 3.95 239.76 0.53 3.00
Elm Ave 92 63.50 8.15 7.73 1.34 3.77 236.77 0.53 2.00
North HS 92 98.50 7.88 6.93 1.15 3.74 235.24 0.46 0.00
Guenser Park 92 211.67 14.48 14.64 1.63 3.51 276.58 0.46 0.00
213th & Chico 79 13407.18 1639.53 1025.43 128.07 3.55 293.63 17.15 0.00
Judson 92 742.25 69.08 59.79 6.86 2.77 294.11 31.71 0.00
Harbor Park 92 75.93 9.46 8.51 1.20 2.30 306.88 0.49 42.99
First Methodist 92 149.47 14.11 10.92 1.74 2.62 296.60 1.64 30.33
G Street 92 48.67 6.61 9.15 1.75 3.58 334.25 1.64 18.33
St. Luke 92 119.72 12.26 10.91 2.43 3.07 351.66 0.91 3.00
Hudson 92 192.64 19.45 16.72 3.36 2.53 13.51 0.91 2.34
Inner Port 90 136.50 12.21 15.91 3.18 3.47 9.29 0.09 86.34
# Try only the october for the prediction map
disaster_oct_stat <- daily_full %>%
  filter(year == '2021', month == '10') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(disaster_oct_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 31 20.21 2.41 3.89 0.73 7.49 304.48 6.39 3
Manhattan 31 25.45 4.06 5.47 1.77 2.91 225.92 0.29 0
West HS 31 41.50 4.87 7.56 1.06 4.35 231.91 1.42 3
Elm Ave 31 63.50 8.15 18.37 2.37 4.16 238.62 1.42 2
North HS 31 98.50 7.88 16.83 2.02 4.11 236.18 1.32 0
Guenser Park 31 211.67 14.48 37.77 3.12 3.88 263.18 1.32 0
213th & Chico 18 13407.18 1639.53 4337.41 536.21 4.41 260.39 49.44 0
Judson 31 742.25 69.08 167.25 17.18 2.91 290.90 79.10 0
Harbor Park 31 75.93 9.46 20.66 2.63 2.29 301.72 1.23 40
First Methodist 31 149.47 14.11 25.88 3.25 2.67 288.59 4.65 29
G Street 31 47.27 6.61 14.01 2.41 3.65 323.75 4.65 17
St. Luke 31 119.72 12.26 22.38 4.26 3.23 330.94 1.77 3
Hudson 31 192.64 19.45 39.27 5.95 2.76 343.90 1.77 2
Inner Port 31 136.50 12.21 22.60 3.79 3.87 325.23 0.26 87

Normal Period (Jan 2020- May 2023) excluding disaster

normal_stat <- daily_full %>%
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(normal_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
El Segundo 223 9.24 3.67 1.70 0.57 7.92 255.36 0.25 2.19
St. Anthony 1160 140.15 3.43 2.64 0.57 7.76 243.55 4.63 1.98
Manhattan 1180 6.82 2.38 1.03 0.53 4.02 241.77 0.06 0.00
West HS 755 18.50 3.01 1.54 0.60 5.40 247.07 0.04 1.91
Elm Ave 1268 11.40 3.20 1.87 0.92 4.53 242.45 0.04 1.86
North HS 758 30.60 2.71 1.91 1.02 4.62 241.06 0.01 0.00
Guenser Park 1073 16.18 2.66 1.50 0.62 4.55 259.26 0.02 0.00
213th & Chico 28 14.27 3.92 4.78 3.12 2.34 312.27 2.75 0.00
Judson 1186 15.78 3.63 1.99 0.72 4.41 266.20 0.43 0.00
Harbor Park 1018 17.42 2.67 1.76 0.47 3.37 283.50 0.07 46.82
First Methodist 1208 20.91 4.18 2.42 0.76 3.81 263.94 0.16 30.09
G Street 883 382.76 18.66 4.13 0.83 4.74 297.94 0.13 19.30
St. Luke 1177 11.62 3.43 2.09 0.69 3.94 270.65 0.16 3.86
Hudson 1266 98.17 6.08 2.87 0.98 3.73 254.80 0.15 3.15
Inner Port 1139 53.70 5.39 5.54 0.84 4.82 229.05 0.05 88.32
# Try only the Oct 2022
normal_oct_stat <- daily_full %>%
  filter(year == '2022' & month == '10') %>%
  group_by(Monitor) %>%
  summarise('Daily observations' = n(),
            'Max Daily Max' = max(H2S_daily_max, na.rm=T),
            'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
            'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
            'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
            'Average Wind Speed' = mean(ws_avg, na.rm=T),
            'Average Wind Direction' = as.numeric(mean(circular(wd_avg, 
                                                                units = 'degrees'), 
                                                       na.rm=T)),
            'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
            'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
  mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0, 
                                            `Average Wind Direction`+360, 
                                            `Average Wind Direction`)) %>%
  mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
  arrange(factor(Monitor, levels = unname(monitor_names)))

knitr::kable(normal_oct_stat, digits = 2, format = 'html')
Monitor Daily observations Max Daily Max Max Daily Average Avg Daily Max Avg Daily Average Average Wind Speed Average Wind Direction Average daily odor complaints within zipcode Active wells 2km
St. Anthony 31 4.24 1.54 2.12 0.98 6.65 259.53 14.13 3
Manhattan 23 1.95 0.89 0.96 0.46 2.68 224.60 0.00 0
Elm Ave 31 3.48 1.59 1.70 1.01 3.45 239.50 0.19 2
Guenser Park 31 6.50 2.02 2.59 1.23 3.14 266.04 0.00 0
Judson 31 2.97 0.63 0.77 0.25 2.59 284.61 1.16 0
Harbor Park 28 7.04 0.90 2.25 0.42 2.24 293.73 0.29 45
First Methodist 31 8.42 2.09 2.60 0.73 2.66 275.98 0.26 30
G Street 31 11.77 1.73 4.07 1.00 3.50 331.32 0.26 20
St. Luke 31 6.40 1.64 2.01 0.51 2.78 8.03 0.65 5
Hudson 29 13.34 2.12 3.18 1.20 2.71 57.46 0.62 3
Inner Port 31 52.63 2.07 6.95 0.80 3.02 161.12 0.00 80

Table 1: Monitor statistics

table1 <- base_monitor_stat %>%
  select(-c(`Angle to Refinery`, `Angle to WRP`, `Capacity of Nearest WRP`)) %>%
  left_join(disaster_stat %>% 
              select(Monitor, `Avg Daily Average`) %>% 
              rename(`Disaster Avg Daily Average` = `Avg Daily Average`), 
            join_by(Monitor)) %>%
  left_join(normal_stat %>% 
              select(Monitor, `Avg Daily Average`, `Average daily odor complaints within zipcode`) %>%
              rename(`Normal Avg Daily Average` = `Avg Daily Average`,
                     `Normal Avg Daily odor complaints` = `Average daily odor complaints within zipcode`), 
            join_by(Monitor)) %>%
  mutate(`Closest Refinery` = paste0(`Closest Refinery`, ' (', round(`Distance to Nearest Refinery (m)`/1000, 1), ')'),
         '#' = 1:n()) %>%
  select(-`Distance to Nearest Refinery (m)`) %>%
  relocate('#', Monitor, `Start Date`, `End Date`, `Closest Refinery`, `Normal Avg Daily Average`, `Disaster Avg Daily Average`, `Normal Avg Daily odor complaints`)

table1_kable <- knitr::kable(table1, format = 'latex', digits = 2)
writeLines(table1_kable, '../figures/table1.tex')

knitr::kable(table1, format = 'html', digits = 2)
# Monitor Start Date End Date Closest Refinery Normal Avg Daily Average Disaster Avg Daily Average Normal Avg Daily odor complaints Distance to Nearest WRP (m) Distance to Dominguez Channel (m) Elevation Enhanced Vegetation Index
1 El Segundo 2020-01-01 2020-08-13 Chevron El Segundo (1.2) 0.57 NA 0.25 2810 6437 60 0.17
2 St. Anthony 2020-04-23 2023-09-30 Chevron El Segundo (1) 0.57 0.69 4.63 2825 6543 44 0.17
3 Manhattan 2020-03-24 2023-09-30 Chevron El Segundo (2.3) 0.53 1.51 0.06 5462 6145 42 0.19
4 West HS 2020-01-01 2022-04-29 Torrance (3.5) 0.60 0.74 0.04 9333 1547 36 0.15
5 Elm Ave 2020-01-01 2023-09-30 Torrance (1.4) 0.92 1.34 0.04 5966 3955 32 0.07
6 North HS 2020-01-01 2022-04-29 Torrance (1.8) 1.02 1.15 0.01 8569 4858 24 0.15
7 Guenser Park 2020-04-23 2023-09-30 Torrance (2.4) 0.62 1.63 0.02 7702 375 16 0.14
8 213th & Chico 2021-10-14 2022-01-28 Marathon Carson (2.9) 3.12 128.07 2.75 4297 50 7 0.12
9 Judson 2020-02-25 2023-09-30 Marathon Carson (2.7) 0.72 6.86 0.43 2692 1481 13 0.14
10 Harbor Park 2020-01-01 2023-09-30 Phillips 66 (1.5) 0.47 1.20 0.07 2012 4262 12 0.60
11 First Methodist 2020-03-04 2023-09-30 Phillips 66 (1.1) 0.76 1.74 0.16 2456 3792 14 0.21
12 G Street 2021-01-20 2023-09-30 Phillips 66 (0.7) 0.83 1.75 0.13 2940 3748 8 0.09
13 St. Luke 2020-02-18 2023-09-30 Marathon Carson (2.8) 0.69 2.43 0.16 6910 1790 10 0.17
14 Hudson 2020-01-01 2023-09-30 Marathon Wilmington (1.4) 0.98 3.36 0.15 5920 705 8 0.14
15 Inner Port 2020-04-22 2023-09-30 Valero (2) 0.84 3.18 0.05 5970 1937 5 0.04

GAM

Feature Selection

Prepare feature and data tables

hourly_responses <- c('H2S_hourly_avg', 'H2S_hourly_max')

# since feb 2022
daily_data_sincefeb2022 <- daily_full %>% filter(day > '2022-01-31')
hourly_data_sincefeb2022 <- hourly_full %>% filter(day > '2022-01-31')

# Disaster
daily_data_dis <- daily_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
hourly_data_dis <- hourly_full %>% filter(year == '2021', month %in% c('10', '11', '12'))

# Exclude disaster stepwise
daily_data_excl_dis <- daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
hourly_data_excl_dis <- hourly_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))

# Everything w. disaster indicator
daily_data_dis_ind <- daily_full %>% 
  mutate(disaster = 
           if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
hourly_data_dis_ind <- hourly_full %>% 
  mutate(disaster = 
           if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))

Select smooth

daily_responses <- c('H2S_daily_avg', 'log(H2S_daily_avg)', 
                     'H2S_daily_max', 'log(H2S_daily_max)')
hourly_responses <- c('H2S_hourly_avg', 'log(H2S_hourly_avg)',
                      'H2S_hourly_max', 'log(H2S_hourly_max)')
dateranges <- c('sincefeb2022', 'dis', 'excl_dis', 'dis_ind', 'full')
smooth <- c("s(as.numeric(month),bs='cc')", 
            "s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
            "te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
smooth_tibble <- tibble(features = c(list(c(smooth[1])), 
                                     list(c(smooth[2])),
                                     list(c(smooth[3])),
                                     list(c(smooth[1:2])),
                                     list(c(smooth[c(1, 3)])),
                                     list(c(smooth[2:3])),
                                     list(c(smooth[1:3]))),
                        disaster_applicable = c(0, 1, 1, 0, 0, 1, 0))
# smooth_compare <- crossing(response = c(daily_responses, hourly_responses), 
#                            daterange = dateranges) %>%
#   cross_join(smooth_tibble) %>%
#   mutate(GCV = NA) %>%
#   filter(!(daterange == 'dis' & disaster_applicable == 0)) %>%
#   select(-disaster_applicable)
# 
# for (i in 1:nrow(smooth_compare)) {
#   features <- unname(unlist(smooth_compare[i,'features']))
#   formula_feature_str <- paste(features, collapse = ' + ')
#   formula_str <- paste(smooth_compare[[i, 'response']], formula_feature_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
# 
#   if (smooth_compare[[i, 'response']] %in% hourly_responses &
#       smooth_compare[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_compare[[i, 'response']] %in% hourly_responses){
#     data <- get(paste0('hourly_data_', smooth_compare[[i, 'daterange']]))
#   } else if (smooth_compare[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_compare[[i, 'daterange']]))
#   }
# 
#   summary <- summary(gam(formula, data = data, method = 'GCV.Cp', select = TRUE))
#   GCV_new <- summary$sp.criterion[[1]]
#   smooth_compare[i, 'GCV'] <- GCV_new
#   print(str_glue('Completed {i} iterations'))
#   gc()
# }
# 
# smooth_compare <- smooth_compare %>%
#   group_by(response, daterange) %>%
#   mutate(best = if_else(GCV == min(GCV), 1, 0)) %>%
#   mutate(rounded_GCV = round(GCV, 2)) %>%
#   rowwise() %>%
#   mutate(month_smooth = if_else(smooth[1] %in% unlist(features), 1, 0),
#          coord_smooth = if_else(smooth[2] %in% unlist(features), 1, 0),
#          coord_day_3D_smooth = if_else(smooth[3] %in% unlist(features), 1, 0)) %>%
#   ungroup()
# saveRDS(smooth_compare, 'smooth_compare.rds')

smooth_compare <- readRDS('smooth_compare.rds')
# get best smooth models for diff response and daterange
best_smooth <- smooth_compare %>%
  group_by(response, daterange) %>%
  filter(GCV == min(GCV)) %>%
  ungroup() %>%
  select(response, daterange, GCV, features, month_smooth, coord_smooth, coord_day_3D_smooth)
best_smooth
  • In general, having the coordinate smooth on top of the month and 3D smooth will not harm the model (except for daily max models since feb 2022).
  • However, it does not improve it by much either…
  • We will keep this in.

Fit smooth and get Residuals

response_names <- c('da', 'log_da', 'dm', 'log_dm', 'ha', 'log_ha', 'hm', 'log_hm')

# smooth_models <- tibble(name = response_names,
#                         response = c(daily_responses, hourly_responses)) %>%
#   crossing(tibble(daterange = dateranges)) %>%
#   mutate(name = paste(name, daterange, sep = '_'))


smooth_predictors <-
  c("s(as.numeric(month),bs='cc')",
    "s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
    "te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")

# smooth_models$residuals <- rep(list(c()), nrow(smooth_models))
# 
# for (i in 1:nrow(smooth_models)) {
#   # first, get residuals of smooth on response
#   if (smooth_models$daterange[i] == 'dis') {
#     formula_feature_str <- paste(smooth_predictors[c(2, 3)], collapse = ' + ')
#   } else {
#     formula_feature_str <- paste(smooth_predictors, collapse = ' + ')
#   }
# 
#   if (smooth_models[[i, 'response']] %in% hourly_responses &
#       smooth_models[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
#     data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
#   } else if (smooth_models[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
#   }
# 
#   formula_str <- paste(smooth_models[[i, 'response']], formula_feature_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
#   residuals <- gam(formula, data = data, method = 'GCV.Cp')$residuals
#   smooth_models$residuals[i] <- list(residuals)
# }
# saveRDS(smooth_models, 'smooth_models.rds')

smooth_models <- readRDS('smooth_models.rds')

Fit residuals to select linear features

disaster_predictors <- c('month', 'weekday', 'wd_avg', 'ws_avg', 
                       'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
                       'monthly_oil_2km', 'monthly_gas_2km', 'active_2km', 
                       'inactive_2km', 'elevation', 'EVI', 
                       'num_odor_complaints', 'closest_wrp_capacity')

everything_predictors <- c('year', 'weekday', 'wd_avg', 'ws_avg', 
                       'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
                       'monthly_oil_2km', 'monthly_gas_2km', 'active_2km', 
                       'inactive_2km', 'elevation', 'EVI', 
                       'num_odor_complaints', 'closest_wrp_capacity')

daily_predictors <- c('daily_downwind_ref', 'daily_downwind_wrp', 'daily_temp',
                      'daily_hum', 'daily_precip')

hourly_predictors <- c('hourly_downwind_ref', 'hourly_downwind_wrp', 'hourly_temp',
                      'hourly_hum', 'hourly_precip')

disaster_linear_pred_str <- paste(disaster_predictors, collapse = ' + ')
everything_linear_pred_str <- paste(everything_predictors, collapse = ' + ')
daily_linear_pred_str <- paste(daily_predictors, collapse = ' + ')
hourly_linear_pred_str <- paste(hourly_predictors, collapse = ' + ')

# for (i in 1:nrow(smooth_models)) {
#   if (smooth_models[[i, 'daterange']] == 'dis_ind' &
#       smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(everything_linear_pred_str,
#                          hourly_linear_pred_str,
#                          'disaster', sep = ' + ')
#   } else if (smooth_models[[i, 'daterange']] == 'dis' &
#              smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(disaster_linear_pred_str,
#                          hourly_linear_pred_str, sep = ' + ')
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
#     formula_str <- paste(everything_linear_pred_str,
#                          hourly_linear_pred_str, sep = ' + ')
#   } else if (smooth_models[[i, 'daterange']] == 'dis_ind') {
#     formula_str <- paste(everything_linear_pred_str,
#                          daily_linear_pred_str,
#                          'disaster', sep = ' + ')
#   } else if (smooth_models[[i, 'daterange']] == 'dis') {
#     formula_str <- paste(disaster_linear_pred_str,
#                          daily_linear_pred_str, sep = ' + ')
#   } else {
#     formula_str <- paste(everything_linear_pred_str,
#                          daily_linear_pred_str, sep = ' + ')
#   }
#   formula_str <- paste('residuals', formula_str, sep = ' ~ ')
#   formula <- as.formula(formula_str)
# 
#   if (smooth_models[[i, 'response']] %in% hourly_responses &
#       smooth_models[[i, 'daterange']] == 'full') {
#     data <- hourly_full
#   } else if (smooth_models[[i, 'response']] %in% hourly_responses){
#     data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
#   } else if (smooth_models[[i, 'daterange']] == 'full') {
#     data <- daily_full
#   } else {
#     data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
#   }
# 
#   data$residuals <- unlist(smooth_models$residuals[i])
#   regsubsets <- regsubsets(formula, data, nvmax = Inf)
#   assign(paste0(smooth_models$name[i], '_regsubsets'), regsubsets)
#   print(str_glue('Completed {i} rows'))
# }
# 
# for (i in 1:nrow(smooth_models)) {
#   saveRDS(get(paste0(smooth_models$name[i], '_regsubsets')),
#           paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds'))
# }

# read regsubsets
for (i in 1:nrow(smooth_models)) {
  assign(paste0(smooth_models$name[i], '_regsubsets'),
         readRDS(paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds')))
}

Get best model size for each model

# best_model_sizes <- smooth_models %>%
#   group_by(name, response, daterange) %>%
#   summarise(mean_smooth_res = mean(unlist(residuals)),
#             var_smooth_res = round(var(unlist(residuals))), 4) %>%
#   select(any_of(c('name', 'response', 'daterange',
#                   'mean_smooth_res', 'var_smooth_res'))) %>%
#   mutate(Adj.R2 = NA,
#          best_R2 = NA,
#          CP = NA,
#          best_CP = NA,
#          BIC = NA,
#          best_BIC = NA,
#          linear_features = NA)
# for (i in 1:nrow(smooth_models)) {
#   regsubset <- summary(get(paste0(smooth_models$name[i], '_regsubsets')))
#   best_sizes <- tibble(Adj.R2 = which.max(regsubset$adjr2),
#                        best_R2 = regsubset$adjr2[which.max(regsubset$adjr2)],
#                        CP = which.min(regsubset$cp),
#                        best_CP = regsubset$cp[which.max(regsubset$cp)],
#                        BIC = which.min(regsubset$bic),
#                        best_BIC = regsubset$bic[which.max(regsubset$bic)],)
#   best_features <- tibble(linear_features = list(setdiff(names(regsubset$which[best_sizes$CP, ]
#                                                    [unlist(regsubset$which[best_sizes$CP, ])]), '(Intercept)')))
#   best_model_sizes[i,] <- bind_cols(best_model_sizes[i, 1:5], best_sizes, best_features)
# }
# 
# best_models <- best_model_sizes %>%
#   rowwise() %>%
#   mutate(smooth_features = if_else(daterange == 'dis', list(smooth_predictors[2:3]),
#                                    list(smooth_predictors))) %>%
#   mutate(full_features = list(c(unlist(smooth_features), unlist(linear_features)))) %>%
#   ungroup()
# 
# saveRDS(best_models, 'best_gam_models.rds')

best_gam_models <- readRDS('best_gam_models.rds')

Save step tables

# for (model_table_name in model_table_names) {
#   model_table <- get(model_table_name)
#   write_csv(model_table, paste0('step_gam_tables/', model_table_name, '.csv'))
# }
# read model tables
# for (model_table_name in model_table_names) {
#   assign(model_table_name, 
#          read_csv(paste0('step_gam_tables/', model_table_name, '.csv')))
# }
# model_features_table <- tibble(model_name = character(),
#                                GCV = numeric(),
#                                p = numeric(),
#                                features = list())
# 
# # for each model, find the best set of predictors
# for (model in model_table_names) {
#   step_gam_table <- get(model)
#   best_model <- step_gam_table[nrow(step_gam_table), ]
#   best_model_features <-names(best_model)[as.logical(c(0, unname(unlist(c(best_model[1,-1])))))]
#   model_features_table <- rbind(model_features_table, 
#                                 tibble(model_name = model,
#                                        GCV = best_model$GCV,
#                                        p = length(best_model_features),
#                                        features = list(best_model_features)))
# }
# 
# meta <- expand.grid(stat, init, date)
# names(meta) <- c('stat', 'init', 'date')
# model_features_table <- cbind(meta, model_features_table)
# 
# saveRDS(model_features_table, 'step_gam_tables/model_features_table.rds')
# model_features_table <- readRDS('step_gam_tables/model_features_table.rds')

Final models

# write function that takes in response, predictors, data and returns gam model 
# different from stepwise function, this has select = FALSE
get_feature_vector <- function(response, daterange) {
  feature_vec <-  best_gam_models %>%
      filter(response == .env$response & daterange == .env$daterange) %>%
      pull(full_features) %>%
      unlist()
  feature_vec <- str_replace_all(feature_vec, 'month\\d+', 'month')
  feature_vec <- str_replace_all(feature_vec, 'year\\d+', 'month')
  feature_vec <- str_replace_all(feature_vec, 'weekday\\D+', 'weekday')
  feature_vec <- unique(feature_vec)
  return(feature_vec)
}

get_data <- function(response, daterange) {
  if (response %in% hourly_responses &
      daterange == 'full') {
    data <- hourly_full
  } else if (response %in% hourly_responses){
    data <- get(paste0('hourly_data_', daterange))
  } else if (daterange == 'full') {
    data <- daily_full
  } else {
    data <- get(paste0('daily_data_', daterange))
  }
  return(data)
}

get_gam_model <- function(response, daterange) {
  predictors <- get_feature_vector(response, daterange)
  formula_feature_str <- paste(predictors, collapse = ' + ')
  formula_str <- paste(response, formula_feature_str, sep = ' ~ ')
  formula <- as.formula(formula_str)

  data <- get_data(response, daterange)
  gam_model <- gam(formula, data = data, method = 'GCV.Cp')
  return(gam_model)
}

# for (i in 1:nrow(best_gam_models)) {
#   model <- get_gam_model(best_gam_models$response[i],
#                          best_gam_models$daterange[i])
#   assign(paste0(best_gam_models$name[i], '_', best_gam_models$daterange[i], '_gam'), model)
#   saveRDS(model, paste0('gam_models/', paste0(best_gam_models$name[i], '_gam.rds')))
# }
# # compare empty init vs full init and find best ones
# final_model_features_table <- model_features_table %>%
#   group_by(stat, date) %>%
#   filter(GCV == min(GCV))
# 
# final_model_features_table <- final_model_features_table %>%
#   select(-c(init, model_name)) %>%
#   distinct()
for (i in 1:nrow(best_gam_models)) {
  assign(paste0(best_gam_models$name[i], '_gam'), 
         readRDS(paste0('gam_models/', best_gam_models$name[i], '_gam.rds')))
}

Daily Average

Since February 2022

# Since feb 2022
summary(da_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_gas_2km + elevation + EVI + num_odor_complaints + 
##     daily_downwind_ref + daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.612e+00  2.532e+00   1.032   0.3023    
## month02             -1.239e-01  1.518e+00  -0.082   0.9349    
## month03             -9.521e-02  2.734e+00  -0.035   0.9722    
## month04             -8.055e-02  3.649e+00  -0.022   0.9824    
## month05             -1.351e-01  4.262e+00  -0.032   0.9747    
## month06             -2.392e-01  4.571e+00  -0.052   0.9583    
## month07             -3.464e-01  4.571e+00  -0.076   0.9396    
## month08             -3.837e-01  4.262e+00  -0.090   0.9283    
## month09             -4.015e-01  3.649e+00  -0.110   0.9124    
## month10             -4.329e-01  2.734e+00  -0.158   0.8742    
## month11             -2.729e-01  1.518e+00  -0.180   0.8573    
## month12             -1.949e-01  3.356e-02  -5.809 6.57e-09 ***
## weekdayMon           8.495e-02  1.462e-02   5.812 6.48e-09 ***
## weekdayTue           1.491e-01  1.459e-02  10.219  < 2e-16 ***
## weekdayWed           1.595e-01  1.463e-02  10.907  < 2e-16 ***
## weekdayThu           1.100e-01  1.464e-02   7.514 6.50e-14 ***
## weekdayFri           1.303e-01  1.460e-02   8.924  < 2e-16 ***
## weekdaySat           7.223e-02  1.459e-02   4.950 7.63e-07 ***
## wd_avg               2.842e-04  5.316e-05   5.346 9.29e-08 ***
## ws_avg              -7.080e-02  2.740e-03 -25.839  < 2e-16 ***
## I(1/dist_wrp^2)      8.393e-07  3.713e-07   2.260   0.0238 *  
## I(1/dist_ref^2)      1.526e-05  1.077e-05   1.418   0.1564    
## I(1/dist_dc^2)      -3.424e-04  1.078e-04  -3.175   0.0015 ** 
## monthly_gas_2km      4.028e-05  9.401e-06   4.285 1.85e-05 ***
## elevation           -3.865e-02  4.501e-03  -8.587  < 2e-16 ***
## EVI                 -1.347e+00  6.496e-02 -20.737  < 2e-16 ***
## num_odor_complaints  9.286e-03  1.876e-03   4.949 7.63e-07 ***
## daily_downwind_ref  -4.646e-03  1.619e-02  -0.287   0.7742    
## daily_temp           2.071e-03  1.428e-03   1.451   0.1469    
## daily_hum           -1.024e-02  4.033e-04 -25.399  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -2.983e-10  0.000   Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.985e+00  8.999 39.13
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.976e+01 80.000 44.58
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/127
## R-sq.(adj) =  0.619   Deviance explained = 62.5%
## GCV = 0.10007  Scale est. = 0.098295  n = 6531

Disaster Only

# Disaster only
summary(da_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + weekday + 
##     wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.242e+01  1.240e+01   2.615 0.009040 ** 
## weekdayMon          -6.532e+00  7.104e+00  -0.920 0.357985    
## weekdayTue          -1.077e+01  7.280e+00  -1.479 0.139444    
## weekdayWed           4.488e-01  7.088e+00   0.063 0.949527    
## weekdayThu          -3.571e-01  7.076e+00  -0.050 0.959766    
## weekdayFri           1.919e+00  6.917e+00   0.277 0.781522    
## weekdaySat           1.194e+00  7.040e+00   0.170 0.865381    
## wd_avg              -6.137e-02  2.086e-02  -2.941 0.003332 ** 
## ws_avg               6.722e+00  1.791e+00   3.753 0.000183 ***
## I(1/dist_wrp^2)     -1.034e-04  5.137e-05  -2.012 0.044470 *  
## I(1/dist_ref^2)      1.197e-03  6.008e-04   1.993 0.046481 *  
## I(1/dist_dc^2)       3.169e-01  1.510e-01   2.098 0.036104 *  
## monthly_oil_2km     -5.058e-03  1.771e-03  -2.857 0.004358 ** 
## inactive_2km         4.246e+00  1.757e+00   2.417 0.015818 *  
## num_odor_complaints -1.072e+00  1.618e-01  -6.624  5.3e-11 ***
## daily_hum           -2.316e-01  1.185e-01  -1.954 0.050880 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.313  8.838 3.189
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 67.903 80.000 8.117
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                    9e-04 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 102/105
## R-sq.(adj) =  0.417   Deviance explained = 45.7%
## GCV = 4760.8  Scale est. = 4427.1    n = 1273

Exclude Disaster

# Exclude disaster
summary(da_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.903e+00  2.123e+00   0.896 0.370002    
## month02             -3.852e-01  1.304e+00  -0.295 0.767767    
## month03             -4.816e-01  2.350e+00  -0.205 0.837619    
## month04             -5.176e-01  3.137e+00  -0.165 0.868951    
## month05             -5.757e-01  3.664e+00  -0.157 0.875163    
## month06             -6.525e-01  3.929e+00  -0.166 0.868114    
## month07             -5.844e-01  3.929e+00  -0.149 0.881776    
## month08             -4.606e-01  3.664e+00  -0.126 0.899978    
## month09             -4.104e-01  3.137e+00  -0.131 0.895914    
## month10             -4.651e-01  2.350e+00  -0.198 0.843101    
## month11             -1.895e-01  1.305e+00  -0.145 0.884495    
## month12             -2.310e-02  2.398e-02  -0.963 0.335418    
## weekdayMon           9.189e-02  1.362e-02   6.748 1.56e-11 ***
## weekdayTue           1.460e-01  1.357e-02  10.763  < 2e-16 ***
## weekdayWed           1.712e-01  1.356e-02  12.622  < 2e-16 ***
## weekdayThu           1.525e-01  1.357e-02  11.237  < 2e-16 ***
## weekdayFri           1.482e-01  1.357e-02  10.921  < 2e-16 ***
## weekdaySat           7.767e-02  1.358e-02   5.719 1.10e-08 ***
## ws_avg              -2.946e-02  1.915e-03 -15.386  < 2e-16 ***
## I(1/dist_wrp^2)      8.190e-07  3.352e-07   2.443 0.014565 *  
## I(1/dist_dc^2)       3.751e-04  2.586e-04   1.451 0.146851    
## monthly_oil_2km      5.451e-06  4.299e-06   1.268 0.204854    
## active_2km           6.908e-03  2.516e-03   2.746 0.006049 ** 
## inactive_2km        -1.944e-03  5.861e-03  -0.332 0.740172    
## elevation           -1.011e-02  2.775e-03  -3.644 0.000270 ***
## EVI                 -1.521e+00  9.630e-02 -15.795  < 2e-16 ***
## num_odor_complaints  4.900e-03  1.046e-03   4.682 2.87e-06 ***
## daily_downwind_ref  -3.168e-02  1.422e-02  -2.228 0.025913 *  
## daily_temp           4.433e-03  1.226e-03   3.617 0.000299 ***
## daily_hum           -8.872e-03  3.413e-04 -25.995  < 2e-16 ***
## daily_precip        -7.195e-02  2.590e-02  -2.778 0.005476 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -3.984e-10      2  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000e+00      9 41.33
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.959e+01     80 54.34
##                                                         p-value    
## s(as.numeric(month))                                       0.82    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 126/128
## R-sq.(adj) =  0.457   Deviance explained = 46.1%
## GCV = 0.18888  Scale est. = 0.18733   n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(da_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.287e+01  1.319e+02  -0.628   0.5299    
## month02              -5.163e-01  8.574e+01  -0.006   0.9952    
## month03              -8.525e-01  1.545e+02  -0.006   0.9956    
## month04              -1.410e+00  2.062e+02  -0.007   0.9945    
## month05              -1.750e+00  2.409e+02  -0.007   0.9942    
## month06              -1.383e+00  2.583e+02  -0.005   0.9957    
## month07              -1.344e+00  2.583e+02  -0.005   0.9958    
## month08              -9.400e-01  2.409e+02  -0.004   0.9969    
## month09              -1.929e-01  2.062e+02  -0.001   0.9993    
## month10               3.427e+00  1.545e+02   0.022   0.9823    
## month11              -3.611e+00  8.574e+01  -0.042   0.9664    
## month12              -2.900e+00  1.226e+00  -2.366   0.0180 *  
## wd_avg               -1.032e-02  2.385e-03  -4.330 1.50e-05 ***
## I(1/dist_wrp^2)      -2.146e-05  1.108e-05  -1.937   0.0528 .  
## I(1/dist_ref^2)      -4.456e-04  6.500e-04  -0.686   0.4930    
## I(1/dist_dc^2)        7.790e-02  4.998e-02   1.559   0.1191    
## monthly_oil_2km       1.170e-04  2.482e-04   0.472   0.6373    
## active_2km            6.352e-01  1.113e-01   5.705 1.19e-08 ***
## elevation            -6.513e-01  1.298e-01  -5.019 5.26e-07 ***
## EVI                  -2.944e+01  5.734e+00  -5.133 2.88e-07 ***
## num_odor_complaints   6.447e-01  3.070e-02  21.005  < 2e-16 ***
## closest_wrp_capacity  2.098e-01  3.392e-02   6.183 6.44e-10 ***
## daily_downwind_ref   -3.222e+00  7.296e-01  -4.416 1.01e-05 ***
## daily_downwind_wrp    9.430e-01  7.885e-01   1.196   0.2317    
## daily_hum            -6.342e-02  1.496e-02  -4.238 2.26e-05 ***
## disaster              8.224e+00  1.733e+00   4.745 2.10e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.086e-09  2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.862e+00  8.962 6.140
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.394e+01 80.000 8.194
##                                                          p-value    
## s(as.numeric(month))                                    2.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 120/123
## R-sq.(adj) =  0.114   Deviance explained =   12%
## GCV = 558.81  Scale est. = 555.02    n = 15595

Everything w.o Disaster Indicator

# Everything
summary(da_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.794e+01  1.320e+02  -0.590   0.5549    
## month02              -6.650e-01  8.580e+01  -0.008   0.9938    
## month03              -1.020e+00  1.546e+02  -0.007   0.9947    
## month04              -1.627e+00  2.063e+02  -0.008   0.9937    
## month05              -1.970e+00  2.410e+02  -0.008   0.9935    
## month06              -1.521e+00  2.585e+02  -0.006   0.9953    
## month07              -1.309e+00  2.585e+02  -0.005   0.9960    
## month08              -7.157e-01  2.410e+02  -0.003   0.9976    
## month09               1.062e-01  2.063e+02   0.001   0.9996    
## month10               6.623e+00  1.546e+02   0.043   0.9658    
## month11              -5.479e-01  8.580e+01  -0.006   0.9949    
## month12              -5.173e-03  1.064e+00  -0.005   0.9961    
## wd_avg               -1.039e-02  2.386e-03  -4.353 1.35e-05 ***
## I(1/dist_wrp^2)      -2.082e-05  1.117e-05  -1.864   0.0623 .  
## I(1/dist_ref^2)      -4.340e-04  6.515e-04  -0.666   0.5054    
## I(1/dist_dc^2)        7.571e-02  5.013e-02   1.510   0.1310    
## monthly_oil_2km       1.614e-04  2.484e-04   0.650   0.5157    
## active_2km            6.139e-01  1.114e-01   5.513 3.59e-08 ***
## elevation            -6.347e-01  1.298e-01  -4.888 1.03e-06 ***
## EVI                  -2.876e+01  5.740e+00  -5.010 5.51e-07 ***
## num_odor_complaints   6.535e-01  3.066e-02  21.314  < 2e-16 ***
## closest_wrp_capacity  2.115e-01  3.401e-02   6.219 5.12e-10 ***
## daily_downwind_ref   -3.309e+00  7.298e-01  -4.534 5.83e-06 ***
## daily_downwind_wrp    9.697e-01  7.890e-01   1.229   0.2191    
## daily_hum            -6.741e-02  1.495e-02  -4.509 6.57e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -1.314e-09  2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.875e+00  8.966 6.216
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.389e+01 80.000 7.919
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 119/122
## R-sq.(adj) =  0.112   Deviance explained = 11.8%
## GCV = 559.54  Scale est. = 555.78    n = 15595

Log Daily Average

Since February 2022

# Since feb 2022
summary(log_da_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     inactive_2km + elevation + EVI + num_odor_complaints + daily_temp + 
##     daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.621e+00  6.273e-01   2.584 0.009778 ** 
## month02             -1.138e-01  3.602e-01  -0.316 0.752104    
## month03             -6.208e-02  6.465e-01  -0.096 0.923512    
## month04              6.151e-03  8.620e-01   0.007 0.994307    
## month05             -4.032e-02  1.006e+00  -0.040 0.968027    
## month06             -1.212e-01  1.078e+00  -0.112 0.910507    
## month07             -2.156e-01  1.078e+00  -0.200 0.841512    
## month08             -2.532e-01  1.005e+00  -0.252 0.801092    
## month09             -3.077e-01  8.603e-01  -0.358 0.720634    
## month10             -5.063e-01  6.463e-01  -0.783 0.433374    
## month11             -3.737e-01  3.616e-01  -1.033 0.301503    
## month12             -2.853e-01  4.146e-02  -6.882 6.44e-12 ***
## weekdayMon           1.230e-01  1.810e-02   6.794 1.19e-11 ***
## weekdayTue           2.077e-01  1.807e-02  11.494  < 2e-16 ***
## weekdayWed           2.044e-01  1.811e-02  11.282  < 2e-16 ***
## weekdayThu           1.482e-01  1.813e-02   8.177 3.48e-16 ***
## weekdayFri           1.691e-01  1.808e-02   9.352  < 2e-16 ***
## weekdaySat           1.139e-01  1.807e-02   6.303 3.11e-10 ***
## wd_avg               3.931e-04  6.582e-05   5.972 2.46e-09 ***
## ws_avg              -1.032e-01  3.392e-03 -30.431  < 2e-16 ***
## I(1/dist_wrp^2)      4.383e-08  3.894e-07   0.113 0.910365    
## I(1/dist_ref^2)      1.114e-06  5.251e-06   0.212 0.831928    
## I(1/dist_dc^2)      -2.247e-04  6.819e-05  -3.296 0.000987 ***
## inactive_2km         1.684e-02  4.975e-03   3.385 0.000716 ***
## elevation           -4.086e-02  7.744e-03  -5.276 1.36e-07 ***
## EVI                 -1.907e+00  1.450e-01 -13.150  < 2e-16 ***
## num_odor_complaints  1.300e-02  2.323e-03   5.598 2.25e-08 ***
## daily_temp           2.309e-03  1.766e-03   1.308 0.190996    
## daily_hum           -1.301e-02  4.956e-04 -26.254  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -4.862e-12  3.000  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   7.659e+00  8.287 54.63
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.989e+01 80.000 77.10
##                                                         p-value    
## s(as.numeric(month))                                          1    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 123/126
## R-sq.(adj) =    0.7   Deviance explained = 70.5%
## GCV = 0.15343  Scale est. = 0.15076   n = 6531

Disaster Only

# Disaster only
summary(log_da_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_temp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.399e+01  5.713e+00  -2.449 0.014450 *  
## month11               5.086e-01  1.310e-01   3.882 0.000109 ***
## month12               4.853e-01  1.452e-01   3.343 0.000854 ***
## weekdayMon            4.504e-02  6.325e-02   0.712 0.476553    
## weekdayTue           -1.303e-01  6.478e-02  -2.012 0.044445 *  
## weekdayWed            1.073e-01  6.281e-02   1.708 0.087868 .  
## weekdayThu            7.894e-02  6.303e-02   1.252 0.210689    
## weekdayFri           -9.842e-02  6.144e-02  -1.602 0.109460    
## weekdaySat           -1.387e-01  6.204e-02  -2.236 0.025544 *  
## ws_avg               -1.428e-01  1.628e-02  -8.766  < 2e-16 ***
## I(1/dist_wrp^2)      -1.877e-05  7.598e-06  -2.470 0.013655 *  
## I(1/dist_ref^2)      -1.698e-04  7.029e-05  -2.415 0.015875 *  
## I(1/dist_dc^2)        1.556e-01  6.090e-02   2.556 0.010722 *  
## monthly_oil_2km       3.410e-04  1.225e-04   2.783 0.005466 ** 
## active_2km           -4.484e-01  4.093e-02 -10.956  < 2e-16 ***
## inactive_2km          9.736e-01  1.340e-01   7.263 6.83e-13 ***
## elevation            -2.777e-01  3.584e-02  -7.748 1.99e-14 ***
## num_odor_complaints   8.276e-03  1.340e-03   6.177 8.98e-10 ***
## closest_wrp_capacity  4.577e-02  1.095e-02   4.182 3.11e-05 ***
## daily_downwind_ref   -8.101e-02  5.618e-02  -1.442 0.149606    
## daily_temp            9.549e-03  6.866e-03   1.391 0.164564    
## daily_hum            -8.024e-03  1.338e-03  -5.997 2.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 25.693
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 58.55     80  9.163
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 108/111
## R-sq.(adj) =  0.746   Deviance explained = 76.3%
## GCV = 0.36938  Scale est. = 0.34427   n = 1273

Exclude Disaster

# Exclude disaster
summary(log_da_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + 
##     active_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     daily_downwind_ref + daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.639e-01  8.054e-01   0.203   0.8388    
## month02             -3.120e-01  2.630e+00  -0.119   0.9056    
## month03             -2.684e-01  3.316e+00  -0.081   0.9355    
## month04             -1.519e-01  2.721e+00  -0.056   0.9555    
## month05             -7.152e-02  2.205e+00  -0.032   0.9741    
## month06             -9.384e-02  2.404e+00  -0.039   0.9689    
## month07             -2.557e-02  2.390e+00  -0.011   0.9915    
## month08              6.363e-02  2.178e+00   0.029   0.9767    
## month09             -2.818e-02  2.708e+00  -0.010   0.9917    
## month10             -3.782e-01  3.312e+00  -0.114   0.9091    
## month11             -1.397e-01  2.628e+00  -0.053   0.9576    
## month12             -3.944e-02  2.658e-02  -1.484   0.1378    
## weekdayMon           1.204e-01  1.509e-02   7.984 1.53e-15 ***
## weekdayTue           1.830e-01  1.503e-02  12.175  < 2e-16 ***
## weekdayWed           2.079e-01  1.502e-02  13.835  < 2e-16 ***
## weekdayThu           1.912e-01  1.503e-02  12.721  < 2e-16 ***
## weekdayFri           1.900e-01  1.504e-02  12.632  < 2e-16 ***
## weekdaySat           1.055e-01  1.505e-02   7.009 2.51e-12 ***
## ws_avg              -4.004e-02  2.121e-03 -18.876  < 2e-16 ***
## I(1/dist_wrp^2)      2.021e-06  3.719e-07   5.435 5.58e-08 ***
## I(1/dist_ref^2)     -8.447e-05  1.742e-05  -4.850 1.25e-06 ***
## I(1/dist_dc^2)      -4.965e-04  2.597e-04  -1.912   0.0559 .  
## monthly_oil_2km     -8.034e-07  4.776e-06  -0.168   0.8664    
## active_2km           1.955e-02  2.804e-03   6.972 3.27e-12 ***
## inactive_2km        -6.689e-03  6.590e-03  -1.015   0.3101    
## elevation           -4.141e-03  3.076e-03  -1.346   0.1782    
## EVI                 -2.488e+00  1.069e-01 -23.269  < 2e-16 ***
## num_odor_complaints  4.609e-03  1.159e-03   3.976 7.06e-05 ***
## daily_downwind_ref   6.259e-03  1.576e-02   0.397   0.6912    
## daily_temp           7.312e-03  1.358e-03   5.386 7.33e-08 ***
## daily_hum           -1.082e-02  3.781e-04 -28.618  < 2e-16 ***
## daily_precip        -1.219e-01  2.869e-02  -4.246 2.19e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     1.354      5   0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 103.12
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.903     80  99.03
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/129
## R-sq.(adj) =  0.567   Deviance explained = 57.1%
## GCV = 0.2318  Scale est. = 0.2299    n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(log_da_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + ws_avg + 
##     I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip + disaster
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          8.546e-01  8.667e-01   0.986 0.324142    
## month02             -2.977e-01  2.700e+00  -0.110 0.912190    
## month03             -2.729e-01  3.401e+00  -0.080 0.936039    
## month04             -1.507e-01  2.780e+00  -0.054 0.956769    
## month05             -7.388e-02  2.245e+00  -0.033 0.973747    
## month06             -8.268e-02  2.464e+00  -0.034 0.973229    
## month07             -8.274e-03  2.461e+00  -0.003 0.997318    
## month08              9.803e-02  2.240e+00   0.044 0.965092    
## month09              1.840e-02  2.779e+00   0.007 0.994717    
## month10             -9.720e-02  3.401e+00  -0.029 0.977198    
## month11             -2.626e-01  2.700e+00  -0.097 0.922510    
## month12             -1.975e-01  2.854e-02  -6.921 4.67e-12 ***
## weekdayMon           1.076e-01  1.642e-02   6.551 5.90e-11 ***
## weekdayTue           1.493e-01  1.637e-02   9.122  < 2e-16 ***
## weekdayWed           1.984e-01  1.635e-02  12.129  < 2e-16 ***
## weekdayThu           1.793e-01  1.637e-02  10.951  < 2e-16 ***
## weekdayFri           1.622e-01  1.634e-02   9.923  < 2e-16 ***
## weekdaySat           8.083e-02  1.637e-02   4.937 8.03e-07 ***
## ws_avg              -4.208e-02  2.366e-03 -17.785  < 2e-16 ***
## I(1/dist_dc^2)       1.637e-03  2.719e-04   6.020 1.78e-09 ***
## monthly_oil_2km     -1.693e-05  4.941e-06  -3.427 0.000612 ***
## active_2km           1.825e-02  2.987e-03   6.111 1.01e-09 ***
## inactive_2km         7.144e-03  6.860e-03   1.041 0.297675    
## elevation           -2.010e-02  3.376e-03  -5.953 2.68e-09 ***
## EVI                 -2.494e+00  1.138e-01 -21.925  < 2e-16 ***
## num_odor_complaints  1.837e-02  7.114e-04  25.821  < 2e-16 ***
## daily_downwind_ref  -4.329e-02  1.681e-02  -2.576 0.010003 *  
## daily_temp          -3.715e-03  1.425e-03  -2.608 0.009120 ** 
## daily_hum           -1.142e-02  3.860e-04 -29.574  < 2e-16 ***
## daily_precip        -1.313e-01  2.726e-02  -4.816 1.48e-06 ***
## disaster             6.501e-01  4.012e-02  16.204  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     1.354  4.000  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.521  8.832 45.76
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.456 80.000 88.00
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 125/128
## R-sq.(adj) =  0.551   Deviance explained = 55.4%
## GCV = 0.29873  Scale est. = 0.29647   n = 15595

Everything w.o Disaster Indicator

# Everything
summary(log_da_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + ws_avg + 
##     I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.304e+00  8.735e-01   1.493  0.13538    
## month02             -2.819e-01  2.722e+00  -0.104  0.91753    
## month03             -2.357e-01  3.429e+00  -0.069  0.94522    
## month04             -1.066e-01  2.803e+00  -0.038  0.96967    
## month05             -3.566e-02  2.264e+00  -0.016  0.98743    
## month06             -5.742e-02  2.484e+00  -0.023  0.98156    
## month07              5.528e-03  2.481e+00   0.002  0.99822    
## month08              1.016e-01  2.259e+00   0.045  0.96412    
## month09              1.082e-02  2.802e+00   0.004  0.99692    
## month10              1.226e-01  3.429e+00   0.036  0.97149    
## month11             -4.326e-02  2.722e+00  -0.016  0.98732    
## month12              3.138e-02  2.502e-02   1.254  0.20978    
## weekdayMon           1.074e-01  1.656e-02   6.485 9.13e-11 ***
## weekdayTue           1.492e-01  1.651e-02   9.040  < 2e-16 ***
## weekdayWed           1.983e-01  1.649e-02  12.026  < 2e-16 ***
## weekdayThu           1.775e-01  1.651e-02  10.753  < 2e-16 ***
## weekdayFri           1.608e-01  1.648e-02   9.758  < 2e-16 ***
## weekdaySat           8.007e-02  1.651e-02   4.850 1.25e-06 ***
## ws_avg              -4.196e-02  2.386e-03 -17.584  < 2e-16 ***
## I(1/dist_dc^2)       1.396e-03  3.149e-04   4.432 9.40e-06 ***
## monthly_oil_2km     -1.530e-05  4.984e-06  -3.070  0.00215 ** 
## active_2km           1.544e-02  3.015e-03   5.120 3.09e-07 ***
## inactive_2km         1.108e-02  6.949e-03   1.594  0.11096    
## elevation           -1.903e-02  3.406e-03  -5.588 2.33e-08 ***
## EVI                 -2.425e+00  1.148e-01 -21.121  < 2e-16 ***
## num_odor_complaints  1.905e-02  7.161e-04  26.600  < 2e-16 ***
## daily_downwind_ref  -4.994e-02  1.694e-02  -2.948  0.00320 ** 
## daily_temp          -3.731e-03  1.436e-03  -2.597  0.00940 ** 
## daily_hum           -1.175e-02  3.887e-04 -30.220  < 2e-16 ***
## daily_precip        -1.255e-01  2.749e-02  -4.567 4.99e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     1.354  4.000  0.001
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.442  8.824 44.510
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.711 80.000 83.880
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/127
## R-sq.(adj) =  0.544   Deviance explained = 54.7%
## GCV = 0.30374  Scale est. = 0.30146   n = 15595

Daily Max

Since February 2022

# Since feb 2022
summary(dm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + monthly_oil_2km + 
##     active_2km + num_odor_complaints + daily_downwind_wrp + daily_temp + 
##     daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.417e+00  1.173e+00   2.914 0.003580 ** 
## month02             -5.408e-01  4.326e-01  -1.250 0.211292    
## month03             -6.875e-01  6.110e-01  -1.125 0.260519    
## month04             -1.207e+00  7.272e-01  -1.660 0.097032 .  
## month05             -1.321e+00  7.623e-01  -1.733 0.083133 .  
## month06             -1.608e+00  7.671e-01  -2.097 0.036063 *  
## month07             -1.761e+00  7.344e-01  -2.398 0.016534 *  
## month08             -1.645e+00  6.579e-01  -2.500 0.012439 *  
## month09             -1.875e+00  5.600e-01  -3.348 0.000820 ***
## month10             -1.707e+00  5.926e-01  -2.881 0.003980 ** 
## month11             -9.262e-01  5.106e-01  -1.814 0.069751 .  
## month12              1.781e-01  3.551e-01   0.502 0.616032    
## weekdayMon           5.553e-01  1.557e-01   3.567 0.000364 ***
## weekdayTue           9.128e-01  1.555e-01   5.872 4.52e-09 ***
## weekdayWed           8.820e-01  1.558e-01   5.663 1.55e-08 ***
## weekdayThu           7.505e-01  1.559e-01   4.814 1.52e-06 ***
## weekdayFri           8.598e-01  1.555e-01   5.528 3.36e-08 ***
## weekdaySat           3.445e-01  1.554e-01   2.216 0.026697 *  
## wd_avg               2.017e-03  5.651e-04   3.570 0.000360 ***
## ws_avg              -2.202e-01  3.005e-02  -7.326 2.65e-13 ***
## I(1/dist_wrp^2)     -2.092e-05  2.595e-06  -8.060 9.02e-16 ***
## I(1/dist_ref^2)      3.425e-04  4.276e-05   8.009 1.37e-15 ***
## monthly_oil_2km      9.761e-05  2.899e-05   3.368 0.000763 ***
## active_2km          -6.509e-02  1.055e-02  -6.172 7.17e-10 ***
## num_odor_complaints  1.365e-01  1.941e-02   7.035 2.20e-12 ***
## daily_downwind_wrp   4.426e-01  1.850e-01   2.392 0.016778 *  
## daily_temp           4.927e-02  1.537e-02   3.205 0.001358 ** 
## daily_hum           -3.440e-02  4.376e-03  -7.861 4.43e-15 ***
## daily_precip         4.083e-01  2.438e-01   1.675 0.093988 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df      F
## s(as.numeric(month))                                    5.711e-14  1.000  0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.840e+00  8.944 14.696
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 4.944e+01 80.000  1.542
##                                                         p-value    
## s(as.numeric(month))                                        0.5    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 124/126
## R-sq.(adj) =  0.209   Deviance explained = 21.9%
## GCV = 11.299  Scale est. = 11.151    n = 6531

Disaster Only

# Disaster only
summary(dm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.849e+02  9.013e+01   3.161  0.00161 ** 
## wd_avg              -5.063e-01  1.611e-01  -3.143  0.00172 ** 
## ws_avg               4.253e+01  1.327e+01   3.206  0.00138 ** 
## I(1/dist_wrp^2)     -9.204e-04  4.076e-04  -2.258  0.02413 *  
## I(1/dist_ref^2)      1.050e-02  4.707e-03   2.231  0.02589 *  
## I(1/dist_dc^2)       2.830e+00  1.205e+00   2.350  0.01895 *  
## monthly_oil_2km     -3.934e-02  1.377e-02  -2.858  0.00434 ** 
## inactive_2km         3.257e+01  1.364e+01   2.388  0.01708 *  
## num_odor_complaints -8.312e+00  1.257e+00  -6.614 5.65e-11 ***
## daily_hum           -1.896e+00  8.990e-01  -2.109  0.03519 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.359  8.855 3.374
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.202 80.000 8.513
##                                                          p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  0.000459 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 96/99
## R-sq.(adj) =  0.432   Deviance explained = 46.9%
## GCV = 2.8585e+05  Scale est. = 2.6709e+05  n = 1273

Exclude Disaster

# Exclude disaster
summary(dm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + EVI + num_odor_complaints + 
##     daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4.032e+00  1.477e+00   2.730  0.00634 ** 
## month02             -5.824e-01  7.727e-01  -0.754  0.45098    
## month03             -6.730e-01  1.368e+00  -0.492  0.62277    
## month04             -7.405e-01  1.823e+00  -0.406  0.68454    
## month05             -1.445e+00  2.128e+00  -0.679  0.49712    
## month06             -1.849e+00  2.283e+00  -0.810  0.41811    
## month07             -1.727e+00  2.285e+00  -0.756  0.44978    
## month08             -1.496e+00  2.134e+00  -0.701  0.48333    
## month09             -1.402e+00  1.832e+00  -0.765  0.44405    
## month10             -1.417e+00  1.388e+00  -1.021  0.30729    
## month11             -6.786e-01  7.943e-01  -0.854  0.39292    
## month12             -1.517e-01  2.287e-01  -0.663  0.50709    
## weekdayMon           5.627e-01  1.318e-01   4.269 1.97e-05 ***
## weekdayTue           9.583e-01  1.313e-01   7.301 3.01e-13 ***
## weekdayWed           8.572e-01  1.312e-01   6.532 6.70e-11 ***
## weekdayThu           7.778e-01  1.313e-01   5.925 3.20e-09 ***
## weekdayFri           7.427e-01  1.313e-01   5.656 1.58e-08 ***
## weekdaySat           3.145e-01  1.314e-01   2.393  0.01672 *  
## wd_avg               1.193e-03  4.528e-04   2.635  0.00843 ** 
## ws_avg              -1.040e-01  1.795e-02  -5.797 6.91e-09 ***
## I(1/dist_wrp^2)      3.984e-06  6.889e-07   5.783 7.50e-09 ***
## I(1/dist_ref^2)      2.288e-05  2.695e-06   8.492  < 2e-16 ***
## EVI                 -3.015e+00  3.761e-01  -8.015 1.18e-15 ***
## num_odor_complaints  7.792e-02  1.001e-02   7.787 7.35e-15 ***
## daily_temp           2.929e-02  1.183e-02   2.476  0.01329 *  
## daily_hum           -3.160e-02  3.260e-03  -9.695  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df      F
## s(as.numeric(month))                                    2.367e-12      2  0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  9.000e+00      9 13.289
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 6.218e+01     80  4.227
##                                                         p-value    
## s(as.numeric(month))                                          1    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.123   Deviance explained = 12.9%
## GCV = 17.654  Scale est. = 17.536    n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(dm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.825e+02  1.032e+03  -0.661 0.508591    
## month02              -1.581e+00  6.710e+02  -0.002 0.998120    
## month03              -4.092e+00  1.209e+03  -0.003 0.997299    
## month04              -8.671e+00  1.614e+03  -0.005 0.995713    
## month05              -1.180e+01  1.885e+03  -0.006 0.995005    
## month06              -8.519e+00  2.021e+03  -0.004 0.996637    
## month07              -8.552e+00  2.021e+03  -0.004 0.996624    
## month08              -5.798e+00  1.885e+03  -0.003 0.997546    
## month09               2.304e-01  1.614e+03   0.000 0.999886    
## month10               3.085e+01  1.209e+03   0.026 0.979641    
## month11              -2.810e+01  6.710e+02  -0.042 0.966598    
## month12              -2.317e+01  9.594e+00  -2.415 0.015760 *  
## wd_avg               -8.180e-02  1.866e-02  -4.383 1.18e-05 ***
## I(1/dist_wrp^2)      -1.880e-04  9.013e-05  -2.086 0.037013 *  
## I(1/dist_ref^2)      -3.224e-03  5.131e-03  -0.628 0.529805    
## I(1/dist_dc^2)        6.701e-01  4.085e-01   1.641 0.100914    
## monthly_oil_2km       1.015e-03  1.958e-03   0.518 0.604164    
## active_2km            5.052e+00  8.734e-01   5.785 7.40e-09 ***
## elevation            -5.216e+00  1.018e+00  -5.125 3.02e-07 ***
## EVI                  -2.285e+02  4.502e+01  -5.075 3.93e-07 ***
## num_odor_complaints   5.415e+00  2.403e-01  22.536  < 2e-16 ***
## closest_wrp_capacity  1.695e+00  2.683e-01   6.319 2.71e-10 ***
## daily_downwind_ref   -2.562e+01  5.710e+00  -4.488 7.25e-06 ***
## daily_downwind_wrp    8.202e+00  6.171e+00   1.329 0.183811    
## daily_hum            -4.493e-01  1.171e-01  -3.837 0.000125 ***
## disaster              6.676e+01  1.356e+01   4.922 8.68e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.192e-09  2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.895e+00  8.972 6.560
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.435e+01 80.000 8.876
##                                                          p-value    
## s(as.numeric(month))                                    5.74e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 120/123
## R-sq.(adj) =  0.124   Deviance explained = 12.9%
## GCV =  34221  Scale est. = 33988     n = 15595

Everything w.o Disaster Indicator

# Everything
summary(dm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_downwind_wrp + daily_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.413e+02  1.033e+03  -0.621   0.5348    
## month02              -3.419e+00  6.714e+02  -0.005   0.9959    
## month03              -6.472e+00  1.210e+03  -0.005   0.9957    
## month04              -1.131e+01  1.615e+03  -0.007   0.9944    
## month05              -1.446e+01  1.886e+03  -0.008   0.9939    
## month06              -1.036e+01  2.023e+03  -0.005   0.9959    
## month07              -8.594e+00  2.023e+03  -0.004   0.9966    
## month08              -4.102e+00  1.886e+03  -0.002   0.9983    
## month09               3.124e+00  1.615e+03   0.002   0.9985    
## month10               5.769e+01  1.210e+03   0.048   0.9620    
## month11              -2.498e+00  6.714e+02  -0.004   0.9970    
## month12               8.355e-01  8.333e+00   0.100   0.9201    
## wd_avg               -8.470e-02  1.873e-02  -4.522 6.16e-06 ***
## ws_avg                1.233e+00  7.795e-01   1.582   0.1136    
## I(1/dist_wrp^2)      -1.716e-04  8.833e-05  -1.943   0.0521 .  
## I(1/dist_ref^2)      -3.335e-03  5.091e-03  -0.655   0.5125    
## I(1/dist_dc^2)        6.105e-01  3.967e-01   1.539   0.1239    
## monthly_oil_2km       1.254e-03  1.948e-03   0.643   0.5200    
## active_2km            4.873e+00  8.721e-01   5.588 2.34e-08 ***
## elevation            -4.980e+00  1.019e+00  -4.889 1.02e-06 ***
## EVI                  -2.185e+02  4.500e+01  -4.855 1.22e-06 ***
## num_odor_complaints   5.488e+00  2.400e-01  22.869  < 2e-16 ***
## closest_wrp_capacity  1.690e+00  2.669e-01   6.334 2.45e-10 ***
## daily_downwind_ref   -2.643e+01  5.712e+00  -4.627 3.74e-06 ***
## daily_downwind_wrp    8.226e+00  6.176e+00   1.332   0.1829    
## daily_hum            -4.765e-01  1.170e-01  -4.072 4.69e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.775e-10  3.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.881e+00  8.968 6.677
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.403e+01 80.000 8.547
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 120/123
## R-sq.(adj) =  0.122   Deviance explained = 12.8%
## GCV =  34269  Scale est. = 34036     n = 15595

Log Daily Max

Since February 2022

# Since feb 2022
summary(log_dm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km + 
##     elevation + EVI + num_odor_complaints + daily_temp + daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.337e+00  7.403e+00   0.316 0.752263    
## month02             -4.496e-02  2.231e+01  -0.002 0.998392    
## month03             -8.551e-03  2.810e+01   0.000 0.999757    
## month04              8.563e-02  2.306e+01   0.004 0.997038    
## month05              2.317e-01  1.879e+01   0.012 0.990161    
## month06              2.818e-01  2.052e+01   0.014 0.989045    
## month07              2.890e-01  2.034e+01   0.014 0.988662    
## month08              2.375e-01  1.847e+01   0.013 0.989741    
## month09              3.436e-02  2.301e+01   0.001 0.998808    
## month10             -3.449e-01  2.815e+01  -0.012 0.990225    
## month11             -2.837e-01  2.233e+01  -0.013 0.989866    
## month12             -1.854e-01  6.654e-02  -2.787 0.005343 ** 
## weekdayMon           2.598e-01  2.901e-02   8.955  < 2e-16 ***
## weekdayTue           3.497e-01  2.897e-02  12.073  < 2e-16 ***
## weekdayWed           3.293e-01  2.903e-02  11.341  < 2e-16 ***
## weekdayThu           2.469e-01  2.906e-02   8.496  < 2e-16 ***
## weekdayFri           2.610e-01  2.897e-02   9.008  < 2e-16 ***
## weekdaySat           1.755e-01  2.897e-02   6.060 1.44e-09 ***
## wd_avg               3.930e-04  1.055e-04   3.725 0.000197 ***
## ws_avg              -1.300e-01  5.435e-03 -23.925  < 2e-16 ***
## I(1/dist_ref^2)      4.217e-05  3.018e-05   1.397 0.162377    
## I(1/dist_dc^2)      -1.807e-04  1.881e-04  -0.961 0.336623    
## monthly_gas_2km      2.978e-05  1.857e-05   1.603 0.108887    
## elevation           -5.687e-02  8.935e-03  -6.365 2.08e-10 ***
## EVI                 -1.413e+00  1.276e-01 -11.074  < 2e-16 ***
## num_odor_complaints  2.857e-02  3.723e-03   7.674 1.91e-14 ***
## daily_temp           9.838e-03  2.830e-03   3.476 0.000512 ***
## daily_hum           -1.386e-02  7.946e-04 -17.450  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     1.353      2  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 25.95
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.051     80 18.10
##                                                          p-value    
## s(as.numeric(month))                                    0.000698 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/125
## R-sq.(adj) =  0.544   Deviance explained = 55.2%
## GCV = 0.39419  Scale est. = 0.38731   n = 6531

Disaster Only

# Disaster only
summary(log_dm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + 
##     inactive_2km + elevation + num_odor_complaints + daily_temp + 
##     daily_hum
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.788e+00  8.089e-01   8.391  < 2e-16 ***
## month11             -1.956e-01  1.658e-01  -1.180 0.238194    
## month12             -4.188e-01  1.844e-01  -2.271 0.023313 *  
## weekdayMon           1.455e-01  8.201e-02   1.775 0.076195 .  
## weekdayTue          -8.291e-02  8.401e-02  -0.987 0.323859    
## weekdayWed           4.878e-02  8.157e-02   0.598 0.549949    
## weekdayThu           1.501e-01  8.180e-02   1.835 0.066792 .  
## weekdayFri          -2.089e-01  7.977e-02  -2.619 0.008934 ** 
## weekdaySat          -1.335e-01  8.061e-02  -1.657 0.097819 .  
## wd_avg               2.471e-04  2.384e-04   1.037 0.300113    
## ws_avg              -1.441e-01  2.116e-02  -6.809 1.56e-11 ***
## I(1/dist_wrp^2)     -3.130e-06  7.624e-07  -4.105 4.31e-05 ***
## I(1/dist_ref^2)      5.212e-05  7.485e-06   6.964 5.48e-12 ***
## I(1/dist_dc^2)       1.662e-02  2.265e-03   7.340 3.96e-13 ***
## monthly_oil_2km      4.527e-04  6.189e-05   7.314 4.74e-13 ***
## monthly_gas_2km     -2.848e-03  3.507e-04  -8.119 1.17e-15 ***
## inactive_2km         8.977e-02  2.344e-02   3.830 0.000135 ***
## elevation           -2.525e-01  2.570e-02  -9.826  < 2e-16 ***
## num_odor_complaints  8.459e-03  1.720e-03   4.919 9.90e-07 ***
## daily_temp           7.311e-03  8.888e-03   0.823 0.410953    
## daily_hum           -6.435e-03  1.733e-03  -3.712 0.000215 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.938  8.991 15.488
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 56.931 80.000  7.708
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 107/110
## R-sq.(adj) =  0.725   Deviance explained = 74.3%
## GCV =  0.621  Scale est. = 0.58009   n = 1273

Exclude Disaster

# Exclude disaster
summary(log_dm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_ref^2) + monthly_gas_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_wrp + 
##     daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.360e+00  4.563e+00   0.298  0.76561    
## month02             -2.655e-01  2.805e+00  -0.095  0.92462    
## month03             -3.708e-01  5.054e+00  -0.073  0.94152    
## month04             -4.552e-01  6.747e+00  -0.067  0.94621    
## month05             -5.671e-01  7.881e+00  -0.072  0.94264    
## month06             -7.032e-01  8.451e+00  -0.083  0.93369    
## month07             -6.054e-01  8.452e+00  -0.072  0.94290    
## month08             -4.295e-01  7.881e+00  -0.054  0.95654    
## month09             -3.728e-01  6.747e+00  -0.055  0.95593    
## month10             -5.761e-01  5.054e+00  -0.114  0.90925    
## month11             -2.056e-01  2.806e+00  -0.073  0.94159    
## month12             -7.229e-02  3.654e-02  -1.978  0.04793 *  
## weekdayMon           2.614e-01  2.074e-02  12.599  < 2e-16 ***
## weekdayTue           3.066e-01  2.066e-02  14.842  < 2e-16 ***
## weekdayWed           3.344e-01  2.065e-02  16.196  < 2e-16 ***
## weekdayThu           2.971e-01  2.066e-02  14.378  < 2e-16 ***
## weekdayFri           2.896e-01  2.066e-02  14.014  < 2e-16 ***
## weekdaySat           1.537e-01  2.068e-02   7.433 1.12e-13 ***
## wd_avg               1.886e-04  7.179e-05   2.628  0.00860 ** 
## ws_avg              -5.397e-02  2.928e-03 -18.434  < 2e-16 ***
## I(1/dist_ref^2)      1.677e-05  1.440e-05   1.164  0.24431    
## monthly_gas_2km      5.280e-06  1.517e-05   0.348  0.72775    
## active_2km           2.310e-02  3.522e-03   6.558 5.63e-11 ***
## inactive_2km        -2.618e-02  8.261e-03  -3.169  0.00153 ** 
## elevation           -1.969e-02  4.091e-03  -4.814 1.50e-06 ***
## EVI                 -2.136e+00  1.228e-01 -17.390  < 2e-16 ***
## num_odor_complaints  1.234e-02  1.591e-03   7.754 9.51e-15 ***
## daily_downwind_wrp   4.891e-02  2.298e-02   2.128  0.03334 *  
## daily_temp           1.481e-02  1.873e-03   7.905 2.86e-15 ***
## daily_hum           -1.259e-02  5.221e-04 -24.114  < 2e-16 ***
## daily_precip        -1.038e-01  3.950e-02  -2.628  0.00860 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    2.474e-10      1  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  9.000e+00      9 34.32
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.909e+01     80 42.29
##                                                         p-value    
## s(as.numeric(month))                                        0.5    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 127/128
## R-sq.(adj) =  0.481   Deviance explained = 48.5%
## GCV = 0.43774  Scale est. = 0.43413   n = 14322

Everything w Disaster Indicator

# Disaster indicator
summary(log_dm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + ws_avg + 
##     I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + daily_downwind_ref + 
##     daily_temp + daily_hum + daily_precip + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.689e+00  3.416e-01  10.802  < 2e-16 ***
## month02              -3.025e-01  1.595e-01  -1.897  0.05782 .  
## month03              -5.292e-01  2.840e-01  -1.863  0.06244 .  
## month04              -6.785e-01  3.787e-01  -1.792  0.07318 .  
## month05              -8.450e-01  4.419e-01  -1.912  0.05589 .  
## month06              -9.548e-01  4.735e-01  -2.016  0.04379 *  
## month07              -7.869e-01  4.732e-01  -1.663  0.09631 .  
## month08              -5.000e-01  4.413e-01  -1.133  0.25717    
## month09              -3.404e-01  3.783e-01  -0.900  0.36824    
## month10              -1.182e-01  2.853e-01  -0.414  0.67881    
## month11              -3.480e-01  1.618e-01  -2.151  0.03146 *  
## month12              -2.763e-01  3.767e-02  -7.333 2.36e-13 ***
## weekdayMon            2.426e-01  2.171e-02  11.177  < 2e-16 ***
## weekdayTue            2.676e-01  2.164e-02  12.369  < 2e-16 ***
## weekdayWed            3.125e-01  2.162e-02  14.452  < 2e-16 ***
## weekdayThu            2.805e-01  2.164e-02  12.960  < 2e-16 ***
## weekdayFri            2.422e-01  2.160e-02  11.209  < 2e-16 ***
## weekdaySat            1.217e-01  2.165e-02   5.622 1.92e-08 ***
## ws_avg               -5.479e-02  3.124e-03 -17.537  < 2e-16 ***
## I(1/dist_wrp^2)      -4.816e-01  2.939e-02 -16.388  < 2e-16 ***
## I(1/dist_ref^2)      -1.948e-03  1.209e-04 -16.110  < 2e-16 ***
## I(1/dist_dc^2)        4.262e+03  2.600e+02  16.388  < 2e-16 ***
## elevation            -2.551e-02  3.838e-03  -6.646 3.11e-11 ***
## EVI                  -1.392e+00  7.331e-02 -18.995  < 2e-16 ***
## num_odor_complaints   2.453e-02  9.400e-04  26.092  < 2e-16 ***
## closest_wrp_capacity -3.532e-03  3.517e-04 -10.044  < 2e-16 ***
## daily_downwind_ref    1.927e-02  2.231e-02   0.864  0.38773    
## daily_temp           -6.874e-04  1.882e-03  -0.365  0.71492    
## daily_hum            -1.322e-02  5.104e-04 -25.913  < 2e-16 ***
## daily_precip         -9.862e-02  3.603e-02  -2.737  0.00621 ** 
## disaster              9.166e-01  5.299e-02  17.299  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -2.453e-12  1.000  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.524e+00  8.844 32.95
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.876e+01 80.000 41.70
##                                                         p-value    
## s(as.numeric(month))                                        0.5    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 126/128
## R-sq.(adj) =  0.503   Deviance explained = 50.7%
## GCV = 0.52205  Scale est. = 0.51816   n = 15595

Everything w.o Disaster Indicator

# Everything
summary(log_dm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     daily_downwind_ref + daily_temp + daily_hum + daily_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.256e+00  7.479e-01   5.691 1.29e-08 ***
## month02              -3.219e-01  4.639e-01  -0.694   0.4877    
## month03              -5.495e-01  8.346e-01  -0.658   0.5103    
## month04              -7.015e-01  1.114e+00  -0.630   0.5289    
## month05              -8.651e-01  1.301e+00  -0.665   0.5061    
## month06              -9.659e-01  1.395e+00  -0.692   0.4887    
## month07              -7.788e-01  1.395e+00  -0.558   0.5766    
## month08              -4.726e-01  1.301e+00  -0.363   0.7164    
## month09              -3.053e-01  1.114e+00  -0.274   0.7840    
## month10               2.407e-01  8.347e-01   0.288   0.7731    
## month11              -4.731e-03  4.642e-01  -0.010   0.9919    
## month12               4.651e-02  3.300e-02   1.410   0.1587    
## weekdayMon            2.431e-01  2.192e-02  11.091  < 2e-16 ***
## weekdayTue            2.674e-01  2.185e-02  12.242  < 2e-16 ***
## weekdayWed            3.125e-01  2.183e-02  14.315  < 2e-16 ***
## weekdayThu            2.780e-01  2.185e-02  12.722  < 2e-16 ***
## weekdayFri            2.403e-01  2.181e-02  11.020  < 2e-16 ***
## weekdaySat            1.204e-01  2.185e-02   5.511 3.62e-08 ***
## wd_avg                9.950e-05  7.450e-05   1.336   0.1817    
## ws_avg               -5.504e-02  3.168e-03 -17.378  < 2e-16 ***
## I(1/dist_wrp^2)      -4.959e-01  2.969e-02 -16.701  < 2e-16 ***
## I(1/dist_ref^2)      -1.930e-02  1.158e-03 -16.670  < 2e-16 ***
## I(1/dist_dc^2)        4.381e+03  2.623e+02  16.701  < 2e-16 ***
## elevation            -2.365e-02  3.871e-03  -6.110 1.02e-09 ***
## EVI                  -1.387e+00  7.389e-02 -18.778  < 2e-16 ***
## num_odor_complaints   2.549e-02  9.471e-04  26.910  < 2e-16 ***
## closest_wrp_capacity -3.447e-03  3.536e-04  -9.749  < 2e-16 ***
## daily_downwind_ref    9.237e-03  2.253e-02   0.410   0.6819    
## daily_temp           -5.642e-04  1.907e-03  -0.296   0.7673    
## daily_hum            -1.363e-02  5.172e-04 -26.357  < 2e-16 ***
## daily_precip         -8.552e-02  3.653e-02  -2.341   0.0192 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -2.863e-11  2.000  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.394e+00  8.791 32.50
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.867e+01 80.000 37.32
##                                                         p-value    
## s(as.numeric(month))                                          1    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 126/128
## R-sq.(adj) =  0.494   Deviance explained = 49.7%
## GCV = 0.53207  Scale est. = 0.52811   n = 15595

Hourly Avg

Since February 2022

# Since feb 2022
summary(ha_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.428e+00  2.291e-01  23.689  < 2e-16 ***
## month02               7.579e-02  1.071e+00   0.071 0.943578    
## month03               3.688e-01  1.684e+00   0.219 0.826667    
## month04               6.329e-01  1.755e+00   0.361 0.718297    
## month05               6.251e-01  1.339e+00   0.467 0.640672    
## month06               5.330e-01  6.819e-01   0.782 0.434435    
## month07               3.795e-01  7.165e-01   0.530 0.596307    
## month08               2.974e-01  1.381e+00   0.215 0.829538    
## month09               3.008e-01  1.786e+00   0.168 0.866287    
## month10               1.082e-01  1.702e+00   0.064 0.949302    
## month11              -9.159e-02  1.076e+00  -0.085 0.932159    
## month12              -1.805e-01  1.268e-02 -14.231  < 2e-16 ***
## weekdayMon            7.595e-02  5.570e-03  13.635  < 2e-16 ***
## weekdayTue            1.404e-01  5.521e-03  25.429  < 2e-16 ***
## weekdayWed            1.480e-01  5.534e-03  26.752  < 2e-16 ***
## weekdayThu            1.075e-01  5.537e-03  19.412  < 2e-16 ***
## weekdayFri            1.252e-01  5.517e-03  22.701  < 2e-16 ***
## weekdaySat            6.518e-02  5.502e-03  11.847  < 2e-16 ***
## wd_avg               -2.063e-04  1.769e-05 -11.657  < 2e-16 ***
## ws_avg               -5.071e-02  6.893e-04 -73.560  < 2e-16 ***
## I(1/dist_wrp^2)       1.093e-06  6.567e-08  16.640  < 2e-16 ***
## I(1/dist_ref^2)       8.588e-06  8.110e-07  10.589  < 2e-16 ***
## I(1/dist_dc^2)       -4.173e-04  1.314e-05 -31.751  < 2e-16 ***
## monthly_oil_2km       7.987e-06  2.177e-06   3.668 0.000244 ***
## monthly_gas_2km      -3.974e-05  6.338e-06  -6.270 3.61e-10 ***
## active_2km            1.475e-02  1.744e-03   8.456  < 2e-16 ***
## inactive_2km         -3.677e-02  4.986e-03  -7.375 1.65e-13 ***
## elevation            -3.057e-02  3.671e-03  -8.329  < 2e-16 ***
## EVI                  -1.467e+00  8.117e-02 -18.072  < 2e-16 ***
## num_odor_complaints   1.398e-02  7.215e-04  19.381  < 2e-16 ***
## closest_wrp_capacity -3.027e-03  4.255e-04  -7.114 1.13e-12 ***
## hourly_downwind_ref  -2.855e-02  5.411e-03  -5.277 1.32e-07 ***
## hourly_downwind_wrp   1.910e-02  6.253e-03   3.054 0.002258 ** 
## hourly_temp          -3.228e-02  3.322e-04 -97.181  < 2e-16 ***
## hourly_hum           -9.569e-03  1.181e-04 -81.050  < 2e-16 ***
## hourly_precip         4.285e-01  1.009e-01   4.249 2.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                             edf Ref.df       F
## s(as.numeric(month))                                     0.8031  2.000   0.013
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.2526  8.253 148.921
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.9926 80.000 394.272
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 128/133
## R-sq.(adj) =  0.352   Deviance explained = 35.3%
## GCV = 0.33353  Scale est. = 0.33327   n = 153718

Disaster Only

# Disaster only
summary(ha_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month + 
##     weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + 
##     I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.439e+02  2.504e+02   1.373 0.169667    
## month11              -5.068e+00  5.033e+00  -1.007 0.313944    
## month12              -1.720e+00  5.452e+00  -0.316 0.752358    
## weekdayMon           -3.528e+00  2.586e+00  -1.364 0.172618    
## weekdayTue           -6.387e+00  2.595e+00  -2.461 0.013857 *  
## weekdayWed           -1.574e+00  2.562e+00  -0.615 0.538825    
## weekdayThu            1.794e+00  2.558e+00   0.701 0.483077    
## weekdayFri            2.918e+00  2.502e+00   1.166 0.243428    
## weekdaySat            2.057e+00  2.531e+00   0.813 0.416351    
## wd_avg               -2.871e-02  6.943e-03  -4.135 3.56e-05 ***
## ws_avg                1.454e+00  3.702e-01   3.929 8.55e-05 ***
## I(1/dist_wrp^2)      -6.655e-04  1.696e-04  -3.925 8.71e-05 ***
## I(1/dist_ref^2)      -1.667e-03  2.781e-03  -0.599 0.548960    
## I(1/dist_dc^2)        5.893e+00  1.665e+00   3.539 0.000402 ***
## monthly_oil_2km      -1.799e-02  8.448e-03  -2.130 0.033219 *  
## monthly_gas_2km      -2.634e-02  2.249e-02  -1.172 0.241372    
## active_2km           -1.138e+01  1.964e+00  -5.795 6.91e-09 ***
## inactive_2km          7.207e+01  6.829e+00  10.554  < 2e-16 ***
## elevation            -3.683e+01  1.824e+00 -20.188  < 2e-16 ***
## EVI                  -6.007e+02  4.246e+01 -14.146  < 2e-16 ***
## num_odor_complaints  -1.253e+00  6.419e-02 -19.512  < 2e-16 ***
## closest_wrp_capacity  1.175e+00  4.648e-01   2.527 0.011497 *  
## hourly_downwind_wrp   7.409e+00  2.649e+00   2.797 0.005164 ** 
## hourly_temp          -8.394e-01  1.383e-01  -6.071 1.29e-09 ***
## hourly_hum           -1.086e-01  4.156e-02  -2.613 0.008980 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.707  8.932 83.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.390 80.000 63.25
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =   0.21   Deviance explained = 21.3%
## GCV =  13802  Scale est. = 13753     n = 30242

Exclude Disaster

# Exclude disaster
summary(ha_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_gas_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           5.171e+00  1.228e-01   42.102  < 2e-16 ***
## month02              -3.275e-01  1.861e-02  -17.599  < 2e-16 ***
## month03              -4.058e-01  3.220e-02  -12.602  < 2e-16 ***
## month04              -3.016e-01  4.275e-02   -7.055 1.74e-12 ***
## month05              -2.952e-01  4.988e-02   -5.919 3.25e-09 ***
## month06              -2.669e-01  5.353e-02   -4.987 6.14e-07 ***
## month07              -1.019e-01  5.360e-02   -1.902 0.057209 .  
## month08               7.398e-02  5.016e-02    1.475 0.140227    
## month09               1.094e-01  4.321e-02    2.531 0.011388 *  
## month10              -2.459e-02  3.323e-02   -0.740 0.459290    
## month11              -7.539e-02  1.997e-02   -3.775 0.000160 ***
## month12               5.687e-03  8.501e-03    0.669 0.503519    
## weekdayMon            8.491e-02  4.871e-03   17.430  < 2e-16 ***
## weekdayTue            1.449e-01  4.831e-03   29.991  < 2e-16 ***
## weekdayWed            1.670e-01  4.827e-03   34.594  < 2e-16 ***
## weekdayThu            1.527e-01  4.826e-03   31.636  < 2e-16 ***
## weekdayFri            1.416e-01  4.827e-03   29.336  < 2e-16 ***
## weekdaySat            6.808e-02  4.818e-03   14.130  < 2e-16 ***
## wd_avg               -3.939e-04  1.531e-05  -25.719  < 2e-16 ***
## ws_avg               -3.267e-02  5.122e-04  -63.792  < 2e-16 ***
## I(1/dist_wrp^2)       5.650e-07  1.133e-07    4.988 6.12e-07 ***
## I(1/dist_ref^2)      -1.923e-06  5.686e-06   -0.338 0.735164    
## I(1/dist_dc^2)        3.696e-04  7.516e-05    4.917 8.78e-07 ***
## monthly_gas_2km      -4.793e-05  3.706e-06  -12.935  < 2e-16 ***
## active_2km            7.070e-03  8.514e-04    8.303  < 2e-16 ***
## inactive_2km         -1.998e-03  2.628e-03   -0.760 0.447140    
## elevation            -1.247e-02  1.107e-03  -11.261  < 2e-16 ***
## EVI                  -1.395e+00  4.691e-02  -29.745  < 2e-16 ***
## num_odor_complaints   5.820e-03  3.757e-04   15.493  < 2e-16 ***
## closest_wrp_capacity -2.193e-03  2.696e-04   -8.133 4.21e-16 ***
## hourly_downwind_ref  -5.354e-02  4.630e-03  -11.566  < 2e-16 ***
## hourly_downwind_wrp   2.387e-02  5.121e-03    4.662 3.13e-06 ***
## hourly_temp          -3.283e-02  2.799e-04 -117.273  < 2e-16 ***
## hourly_hum           -9.138e-03  9.611e-05  -95.077  < 2e-16 ***
## hourly_precip        -4.412e-01  1.143e-01   -3.859 0.000114 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -5.732e-12      1   0.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.989e+00      9 317.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.997e+01     80 470.4
##                                                          p-value    
## s(as.numeric(month))                                    1.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/132
## R-sq.(adj) =  0.246   Deviance explained = 24.6%
## GCV = 0.56167  Scale est. = 0.56147   n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(ha_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.461e+02  4.269e+01  -3.422 0.000621 ***
## month02              -2.079e-01  2.771e+01  -0.008 0.994014    
## month03              -5.634e-01  4.992e+01  -0.011 0.990995    
## month04              -4.432e-01  6.664e+01  -0.007 0.994694    
## month05              -6.434e-01  7.784e+01  -0.008 0.993406    
## month06               5.551e-02  8.348e+01   0.001 0.999469    
## month07               3.045e-01  8.348e+01   0.004 0.997090    
## month08               9.068e-01  7.784e+01   0.012 0.990705    
## month09               1.462e+00  6.664e+01   0.022 0.982495    
## month10               4.600e+00  4.992e+01   0.092 0.926590    
## month11              -3.497e+00  2.771e+01  -0.126 0.899592    
## month12              -3.341e+00  3.976e-01  -8.402  < 2e-16 ***
## weekdayMon           -6.853e-01  2.303e-01  -2.975 0.002927 ** 
## weekdayTue           -6.205e-01  2.285e-01  -2.716 0.006615 ** 
## weekdayWed           -3.120e-01  2.282e-01  -1.367 0.171625    
## weekdayThu           -8.171e-02  2.282e-01  -0.358 0.720303    
## weekdayFri            2.022e-01  2.279e-01   0.887 0.375079    
## weekdaySat            1.024e-01  2.278e-01   0.449 0.653151    
## wd_avg               -7.110e-03  7.129e-04  -9.973  < 2e-16 ***
## ws_avg                1.418e-01  2.452e-02   5.782 7.39e-09 ***
## I(1/dist_wrp^2)       4.580e-06  3.929e-06   1.166 0.243733    
## I(1/dist_ref^2)      -2.979e-03  4.154e-04  -7.171 7.46e-13 ***
## I(1/dist_dc^2)        2.296e-01  2.938e-02   7.816 5.48e-15 ***
## monthly_oil_2km       3.011e-04  8.786e-05   3.427 0.000611 ***
## monthly_gas_2km       6.389e-07  1.795e-04   0.004 0.997161    
## active_2km            2.318e-01  4.148e-02   5.588 2.30e-08 ***
## inactive_2km          2.514e+00  1.166e-01  21.554  < 2e-16 ***
## elevation            -1.335e+00  5.276e-02 -25.295  < 2e-16 ***
## EVI                  -5.344e+01  2.178e+00 -24.539  < 2e-16 ***
## num_odor_complaints   5.729e-01  1.002e-02  57.162  < 2e-16 ***
## closest_wrp_capacity  4.007e-01  1.447e-02  27.684  < 2e-16 ***
## hourly_downwind_ref  -1.995e+00  2.153e-01  -9.268  < 2e-16 ***
## hourly_downwind_wrp   1.523e+00  2.415e-01   6.308 2.83e-10 ***
## hourly_temp          -2.245e-01  1.276e-02 -17.591  < 2e-16 ***
## hourly_hum           -5.432e-02  4.319e-03 -12.579  < 2e-16 ***
## hourly_precip        -2.470e+00  4.969e+00  -0.497 0.619039    
## disaster              7.970e+00  5.600e-01  14.231  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    -5.76e-09      1  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00e+00      9 84.69
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.75e+01     80 79.15
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 131/134
## R-sq.(adj) =  0.0488   Deviance explained = 4.91%
## GCV = 1367.6  Scale est. = 1367.2    n = 367838

Everything w.o Disaster Indicator

# Everything
summary(ha_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.421e+02  4.271e+01  -3.328 0.000874 ***
## month02              -3.171e-01  2.772e+01  -0.011 0.990873    
## month03              -6.998e-01  4.993e+01  -0.014 0.988819    
## month04              -6.096e-01  6.666e+01  -0.009 0.992704    
## month05              -8.162e-01  7.786e+01  -0.010 0.991637    
## month06              -3.188e-02  8.350e+01   0.000 0.999695    
## month07               3.831e-01  8.350e+01   0.005 0.996339    
## month08               1.163e+00  7.786e+01   0.015 0.988083    
## month09               1.786e+00  6.666e+01   0.027 0.978623    
## month10               7.730e+00  4.994e+01   0.155 0.876973    
## month11              -5.089e-01  2.772e+01  -0.018 0.985353    
## month12              -5.331e-01  3.454e-01  -1.543 0.122736    
## weekdayMon           -6.847e-01  2.304e-01  -2.972 0.002958 ** 
## weekdayTue           -6.158e-01  2.286e-01  -2.694 0.007052 ** 
## weekdayWed           -3.131e-01  2.283e-01  -1.371 0.170285    
## weekdayThu           -9.952e-02  2.283e-01  -0.436 0.662819    
## weekdayFri            1.851e-01  2.280e-01   0.812 0.416865    
## weekdaySat            8.977e-02  2.279e-01   0.394 0.693623    
## wd_avg               -7.143e-03  7.131e-04 -10.016  < 2e-16 ***
## ws_avg                1.408e-01  2.453e-02   5.739 9.53e-09 ***
## I(1/dist_wrp^2)       5.879e-06  3.990e-06   1.473 0.140624    
## I(1/dist_ref^2)      -2.993e-03  4.165e-04  -7.186 6.70e-13 ***
## I(1/dist_dc^2)        2.254e-01  2.939e-02   7.667 1.76e-14 ***
## monthly_oil_2km       3.395e-04  8.803e-05   3.857 0.000115 ***
## monthly_gas_2km       9.856e-06  1.814e-04   0.054 0.956678    
## active_2km            2.019e-01  4.147e-02   4.869 1.12e-06 ***
## inactive_2km          2.564e+00  1.168e-01  21.962  < 2e-16 ***
## elevation            -1.331e+00  5.280e-02 -25.207  < 2e-16 ***
## EVI                  -5.316e+01  2.179e+00 -24.396  < 2e-16 ***
## num_odor_complaints   5.810e-01  1.001e-02  58.048  < 2e-16 ***
## closest_wrp_capacity  4.048e-01  1.449e-02  27.939  < 2e-16 ***
## hourly_downwind_ref  -2.041e+00  2.153e-01  -9.481  < 2e-16 ***
## hourly_downwind_wrp   1.559e+00  2.415e-01   6.453 1.10e-10 ***
## hourly_temp          -2.276e-01  1.277e-02 -17.831  < 2e-16 ***
## hourly_hum           -5.672e-02  4.317e-03 -13.140  < 2e-16 ***
## hourly_precip        -1.979e+00  4.970e+00  -0.398 0.690535    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -1.049e-09      2  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000e+00      9 86.14
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.768e+01     80 76.89
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 130/133
## R-sq.(adj) =  0.0483   Deviance explained = 4.86%
## GCV = 1368.4  Scale est. = 1367.9    n = 367838

Log Hourly Avg

Since February 2022

# Since feb 2022
summary(log_ha_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.207e+00  7.312e-01   4.386 1.16e-05 ***
## month02              3.402e-02  4.379e-01   0.078 0.938072    
## month03              2.133e-01  7.887e-01   0.270 0.786847    
## month04              3.888e-01  1.053e+00   0.369 0.711880    
## month05              2.624e-01  1.230e+00   0.213 0.830997    
## month06              1.090e-01  1.319e+00   0.083 0.934136    
## month07             -2.448e-02  1.319e+00  -0.019 0.985191    
## month08             -3.735e-02  1.230e+00  -0.030 0.975770    
## month09              8.082e-02  1.053e+00   0.077 0.938801    
## month10             -7.892e-02  7.887e-01  -0.100 0.920293    
## month11             -1.731e-01  4.380e-01  -0.395 0.692690    
## month12             -2.351e-01  1.206e-02 -19.498  < 2e-16 ***
## weekdayMon           9.059e-02  5.295e-03  17.109  < 2e-16 ***
## weekdayTue           1.575e-01  5.248e-03  30.001  < 2e-16 ***
## weekdayWed           1.514e-01  5.261e-03  28.777  < 2e-16 ***
## weekdayThu           1.084e-01  5.264e-03  20.600  < 2e-16 ***
## weekdayFri           1.266e-01  5.245e-03  24.141  < 2e-16 ***
## weekdaySat           7.732e-02  5.230e-03  14.785  < 2e-16 ***
## wd_avg              -3.798e-04  1.682e-05 -22.580  < 2e-16 ***
## ws_avg              -6.208e-02  6.553e-04 -94.732  < 2e-16 ***
## I(1/dist_wrp^2)      1.276e-06  6.672e-08  19.120  < 2e-16 ***
## I(1/dist_ref^2)     -1.255e-05  3.232e-06  -3.883 0.000103 ***
## I(1/dist_dc^2)      -6.694e-04  2.379e-05 -28.136  < 2e-16 ***
## monthly_oil_2km      3.505e-07  2.070e-06   0.169 0.865544    
## monthly_gas_2km     -4.783e-05  6.025e-06  -7.939 2.05e-15 ***
## active_2km           1.795e-02  1.658e-03  10.825  < 2e-16 ***
## inactive_2km        -6.403e-03  3.469e-03  -1.845 0.064979 .  
## elevation           -4.601e-02  2.711e-03 -16.971  < 2e-16 ***
## EVI                 -2.373e+00  6.234e-02 -38.067  < 2e-16 ***
## num_odor_complaints  1.437e-02  6.859e-04  20.948  < 2e-16 ***
## hourly_downwind_ref -7.808e-03  5.144e-03  -1.518 0.129064    
## hourly_downwind_wrp  4.748e-02  5.945e-03   7.986 1.40e-15 ***
## hourly_temp         -3.114e-02  3.158e-04 -98.625  < 2e-16 ***
## hourly_hum          -9.724e-03  1.122e-04 -86.645  < 2e-16 ***
## hourly_precip        1.964e-01  9.588e-02   2.048 0.040524 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df      F
## s(as.numeric(month))                                    1.153e-10      0    Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  9.000e+00      9  341.1
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 8.000e+01     80 1154.9
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/132
## R-sq.(adj) =  0.548   Deviance explained = 54.8%
## GCV = 0.30141  Scale est. = 0.30117   n = 153718

Disaster Only

# Disaster only
summary(log_ha_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.338e+01  1.863e+00 -12.550  < 2e-16 ***
## month11               1.553e-01  3.505e-02   4.431 9.39e-06 ***
## month12              -1.110e-01  3.620e-02  -3.065  0.00218 ** 
## weekdayMon            2.516e-02  1.808e-02   1.392  0.16397    
## weekdayTue           -9.922e-02  1.814e-02  -5.470 4.54e-08 ***
## weekdayWed            1.126e-01  1.791e-02   6.291 3.20e-10 ***
## weekdayThu            9.509e-02  1.806e-02   5.265 1.41e-07 ***
## weekdayFri            3.090e-02  1.749e-02   1.767  0.07732 .  
## weekdaySat           -8.340e-02  1.769e-02  -4.714 2.44e-06 ***
## wd_avg               -7.720e-04  4.893e-05 -15.778  < 2e-16 ***
## ws_avg               -1.110e-01  2.666e-03 -41.647  < 2e-16 ***
## I(1/dist_wrp^2)      -4.535e-05  3.437e-06 -13.194  < 2e-16 ***
## I(1/dist_ref^2)      -2.659e-04  1.972e-05 -13.487  < 2e-16 ***
## I(1/dist_dc^2)        3.288e-01  2.425e-02  13.559  < 2e-16 ***
## monthly_oil_2km       6.284e-04  4.022e-05  15.627  < 2e-16 ***
## active_2km           -3.871e-01  1.253e-02 -30.882  < 2e-16 ***
## inactive_2km          6.700e-01  4.309e-02  15.551  < 2e-16 ***
## elevation            -2.089e-01  1.147e-02 -18.209  < 2e-16 ***
## num_odor_complaints   4.279e-03  4.510e-04   9.489  < 2e-16 ***
## closest_wrp_capacity  6.845e-02  3.546e-03  19.302  < 2e-16 ***
## hourly_downwind_ref  -1.641e-01  1.466e-02 -11.197  < 2e-16 ***
## hourly_downwind_wrp   2.034e-01  1.867e-02  10.898  < 2e-16 ***
## hourly_temp          -3.594e-02  9.671e-04 -37.166  < 2e-16 ***
## hourly_hum           -1.034e-02  2.939e-04 -35.177  < 2e-16 ***
## hourly_precip         1.048e+00  2.535e-01   4.134 3.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 250.62
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.37     80  88.83
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =  0.583   Deviance explained = 58.5%
## GCV = 0.67426  Scale est. = 0.6718    n = 30242

Exclude Disaster

# Exclude disaster
summary(log_ha_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           3.553e+00  1.233e-01   28.809  < 2e-16 ***
## month02              -3.315e-01  1.317e-02  -25.166  < 2e-16 ***
## month03              -4.046e-01  2.238e-02  -18.079  < 2e-16 ***
## month04              -2.607e-01  2.963e-02   -8.798  < 2e-16 ***
## month05              -1.949e-01  3.455e-02   -5.642 1.69e-08 ***
## month06              -1.509e-01  3.709e-02   -4.068 4.74e-05 ***
## month07               4.910e-02  3.716e-02    1.321 0.186406    
## month08               2.784e-01  3.482e-02    7.994 1.31e-15 ***
## month09               3.001e-01  3.009e-02    9.973  < 2e-16 ***
## month10               3.470e-03  2.341e-02    0.148 0.882168    
## month11              -8.855e-03  1.447e-02   -0.612 0.540524    
## month12               6.263e-03  7.087e-03    0.884 0.376852    
## weekdayMon            9.329e-02  4.061e-03   22.974  < 2e-16 ***
## weekdayTue            1.470e-01  4.027e-03   36.502  < 2e-16 ***
## weekdayWed            1.691e-01  4.023e-03   42.040  < 2e-16 ***
## weekdayThu            1.539e-01  4.023e-03   38.254  < 2e-16 ***
## weekdayFri            1.443e-01  4.024e-03   35.860  < 2e-16 ***
## weekdaySat            7.404e-02  4.016e-03   18.436  < 2e-16 ***
## wd_avg               -6.137e-04  1.277e-05  -48.073  < 2e-16 ***
## ws_avg               -3.843e-02  4.269e-04  -90.028  < 2e-16 ***
## I(1/dist_wrp^2)       2.574e-06  1.140e-07   22.583  < 2e-16 ***
## I(1/dist_ref^2)      -1.092e-04  4.979e-06  -21.930  < 2e-16 ***
## I(1/dist_dc^2)        8.109e-04  1.821e-04    4.454 8.43e-06 ***
## monthly_oil_2km      -4.300e-06  1.662e-06   -2.588 0.009662 ** 
## monthly_gas_2km      -5.942e-05  3.248e-06  -18.295  < 2e-16 ***
## active_2km            1.841e-02  7.543e-04   24.403  < 2e-16 ***
## inactive_2km         -7.903e-03  2.199e-03   -3.594 0.000326 ***
## elevation            -6.017e-03  9.253e-04   -6.503 7.89e-11 ***
## EVI                  -2.287e+00  3.914e-02  -58.419  < 2e-16 ***
## num_odor_complaints   2.953e-03  3.132e-04    9.427  < 2e-16 ***
## closest_wrp_capacity -2.429e-03  2.687e-04   -9.041  < 2e-16 ***
## hourly_downwind_ref  -3.675e-02  3.859e-03   -9.523  < 2e-16 ***
## hourly_downwind_wrp   6.508e-02  4.268e-03   15.246  < 2e-16 ***
## hourly_temp          -3.036e-02  2.333e-04 -130.116  < 2e-16 ***
## hourly_hum           -8.381e-03  8.012e-05 -104.604  < 2e-16 ***
## hourly_precip        -7.615e-01  9.530e-02   -7.991 1.34e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df    F
## s(as.numeric(month))                                    5.881e-12      0  Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.998e+00      9 1217
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01     80 1558
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 130/133
## R-sq.(adj) =  0.442   Deviance explained = 44.3%
## GCV = 0.39025  Scale est. = 0.3901    n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(log_ha_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip + 
##     disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.077e+00  2.482e-01    4.338 1.44e-05 ***
## month02              -2.238e-01  6.918e-01   -0.324    0.746    
## month03              -2.337e-01  8.715e-01   -0.268    0.789    
## month04              -6.518e-02  7.124e-01   -0.091    0.927    
## month05              -2.003e-03  5.753e-01   -0.003    0.997    
## month06              -3.908e-03  6.314e-01   -0.006    0.995    
## month07               1.030e-01  6.307e-01    0.163    0.870    
## month08               2.540e-01  5.741e-01    0.442    0.658    
## month09               2.208e-01  7.120e-01    0.310    0.756    
## month10               2.476e-02  8.714e-01    0.028    0.977    
## month11              -1.114e-01  6.918e-01   -0.161    0.872    
## month12              -1.504e-01  7.296e-03  -20.616  < 2e-16 ***
## weekdayMon            8.104e-02  4.221e-03   19.200  < 2e-16 ***
## weekdayTue            1.197e-01  4.187e-03   28.589  < 2e-16 ***
## weekdayWed            1.606e-01  4.182e-03   38.411  < 2e-16 ***
## weekdayThu            1.438e-01  4.182e-03   34.380  < 2e-16 ***
## weekdayFri            1.314e-01  4.177e-03   31.452  < 2e-16 ***
## weekdaySat            6.026e-02  4.175e-03   14.435  < 2e-16 ***
## wd_avg               -6.880e-04  1.306e-05  -52.657  < 2e-16 ***
## ws_avg               -4.069e-02  4.494e-04  -90.536  < 2e-16 ***
## I(1/dist_wrp^2)       1.648e-06  1.034e-07   15.935  < 2e-16 ***
## I(1/dist_ref^2)      -1.252e-04  4.855e-06  -25.789  < 2e-16 ***
## I(1/dist_dc^2)        6.981e-03  2.977e-04   23.451  < 2e-16 ***
## monthly_oil_2km      -7.660e-06  1.570e-06   -4.880 1.06e-06 ***
## active_2km            2.180e-02  7.782e-04   28.007  < 2e-16 ***
## inactive_2km          2.259e-02  2.248e-03   10.047  < 2e-16 ***
## elevation            -3.036e-02  9.705e-04  -31.285  < 2e-16 ***
## EVI                  -3.065e+00  3.985e-02  -76.922  < 2e-16 ***
## num_odor_complaints   1.222e-02  1.837e-04   66.535  < 2e-16 ***
## closest_wrp_capacity  3.715e-03  2.673e-04   13.898  < 2e-16 ***
## hourly_downwind_ref  -6.755e-02  3.946e-03  -17.120  < 2e-16 ***
## hourly_downwind_wrp   8.569e-02  4.422e-03   19.377  < 2e-16 ***
## hourly_temp          -3.332e-02  2.339e-04 -142.468  < 2e-16 ***
## hourly_hum           -9.015e-03  7.914e-05 -113.911  < 2e-16 ***
## hourly_precip        -9.368e-01  9.106e-02  -10.287  < 2e-16 ***
## disaster              4.800e-01  1.028e-02   46.710  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df        F
## s(as.numeric(month))                                     1.354      6    0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.988      9  734.615
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.968     80 1476.695
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 128/133
## R-sq.(adj) =  0.438   Deviance explained = 43.8%
## GCV = 0.45927  Scale est. = 0.45912   n = 367838

Everything w.o Disaster Indicator

# Everything
summary(log_ha_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + active_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.279e+00  2.489e-01    5.140 2.75e-07 ***
## month02              -2.113e-01  6.939e-01   -0.305  0.76072    
## month03              -2.096e-01  8.740e-01   -0.240  0.81052    
## month04              -3.973e-02  7.145e-01   -0.056  0.95565    
## month05               1.578e-02  5.770e-01    0.027  0.97818    
## month06               3.355e-03  6.333e-01    0.005  0.99577    
## month07               1.014e-01  6.326e-01    0.160  0.87263    
## month08               2.469e-01  5.758e-01    0.429  0.66813    
## month09               2.091e-01  7.142e-01    0.293  0.76965    
## month10               1.841e-01  8.740e-01    0.211  0.83315    
## month11               5.095e-02  6.939e-01    0.073  0.94146    
## month12               1.905e-02  6.349e-03    3.000  0.00270 ** 
## weekdayMon            8.107e-02  4.233e-03   19.151  < 2e-16 ***
## weekdayTue            1.200e-01  4.200e-03   28.568  < 2e-16 ***
## weekdayWed            1.606e-01  4.195e-03   38.275  < 2e-16 ***
## weekdayThu            1.427e-01  4.194e-03   34.018  < 2e-16 ***
## weekdayFri            1.303e-01  4.189e-03   31.112  < 2e-16 ***
## weekdaySat            5.951e-02  4.187e-03   14.212  < 2e-16 ***
## wd_avg               -6.895e-04  1.310e-05  -52.619  < 2e-16 ***
## ws_avg               -4.077e-02  4.508e-04  -90.448  < 2e-16 ***
## I(1/dist_wrp^2)       1.883e-06  1.026e-07   18.344  < 2e-16 ***
## I(1/dist_ref^2)      -1.327e-04  4.890e-06  -27.133  < 2e-16 ***
## I(1/dist_dc^2)        6.616e-03  2.917e-04   22.682  < 2e-16 ***
## monthly_oil_2km      -5.366e-06  1.574e-06   -3.410  0.00065 ***
## active_2km            1.967e-02  7.795e-04   25.237  < 2e-16 ***
## inactive_2km          2.720e-02  2.255e-03   12.058  < 2e-16 ***
## elevation            -3.014e-02  9.736e-04  -30.961  < 2e-16 ***
## EVI                  -3.047e+00  3.998e-02  -76.224  < 2e-16 ***
## num_odor_complaints   1.272e-02  1.839e-04   69.151  < 2e-16 ***
## closest_wrp_capacity  4.033e-03  2.682e-04   15.037  < 2e-16 ***
## hourly_downwind_ref  -7.040e-02  3.957e-03  -17.793  < 2e-16 ***
## hourly_downwind_wrp   8.775e-02  4.435e-03   19.784  < 2e-16 ***
## hourly_temp          -3.350e-02  2.346e-04 -142.832  < 2e-16 ***
## hourly_hum           -9.160e-03  7.932e-05 -115.485  < 2e-16 ***
## hourly_precip        -9.074e-01  9.133e-02   -9.935  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df        F
## s(as.numeric(month))                                     1.354      6    0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.994      9  740.199
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.984     80 1445.150
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 127/132
## R-sq.(adj) =  0.435   Deviance explained = 43.5%
## GCV = 0.46199  Scale est. = 0.46184   n = 367838

Hourly Max

Since February 2022

# Since feb 2022
summary(hm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + inactive_2km + elevation + 
##     EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.792e+00  5.587e-01  12.158  < 2e-16 ***
## month02               4.507e-02  1.723e+00   0.026 0.979127    
## month03               3.985e-01  2.172e+00   0.183 0.854442    
## month04               6.758e-01  1.785e+00   0.379 0.705020    
## month05               6.962e-01  1.450e+00   0.480 0.631039    
## month06               6.309e-01  1.576e+00   0.400 0.688975    
## month07               4.864e-01  1.563e+00   0.311 0.755711    
## month08               4.418e-01  1.428e+00   0.309 0.756992    
## month09               4.402e-01  1.780e+00   0.247 0.804686    
## month10               1.809e-01  2.174e+00   0.083 0.933703    
## month11              -6.956e-02  1.723e+00  -0.040 0.967804    
## month12              -1.587e-01  2.603e-02  -6.098 1.08e-09 ***
## weekdayMon            1.238e-01  1.143e-02  10.831  < 2e-16 ***
## weekdayTue            2.128e-01  1.133e-02  18.777  < 2e-16 ***
## weekdayWed            2.199e-01  1.136e-02  19.365  < 2e-16 ***
## weekdayThu            1.682e-01  1.136e-02  14.799  < 2e-16 ***
## weekdayFri            1.990e-01  1.132e-02  17.578  < 2e-16 ***
## weekdaySat            9.236e-02  1.129e-02   8.180 2.86e-16 ***
## wd_avg               -7.348e-05  3.631e-05  -2.024 0.043009 *  
## ws_avg               -6.778e-02  1.415e-03 -47.908  < 2e-16 ***
## I(1/dist_wrp^2)       8.233e-07  1.345e-07   6.122 9.26e-10 ***
## I(1/dist_ref^2)       2.110e-05  1.770e-06  11.926  < 2e-16 ***
## I(1/dist_dc^2)       -4.611e-04  2.916e-05 -15.813  < 2e-16 ***
## monthly_oil_2km       1.332e-05  4.208e-06   3.166 0.001547 ** 
## monthly_gas_2km      -5.650e-08  1.269e-05  -0.004 0.996449    
## inactive_2km         -2.039e-02  9.167e-03  -2.225 0.026105 *  
## elevation            -4.044e-02  7.753e-03  -5.217 1.82e-07 ***
## EVI                  -1.419e+00  1.512e-01  -9.382  < 2e-16 ***
## num_odor_complaints   3.459e-02  1.481e-03  23.359  < 2e-16 ***
## closest_wrp_capacity -4.057e-03  9.017e-04  -4.499 6.82e-06 ***
## hourly_downwind_ref  -4.650e-02  1.111e-02  -4.187 2.82e-05 ***
## hourly_downwind_wrp   4.576e-02  1.283e-02   3.565 0.000363 ***
## hourly_temp          -3.825e-02  6.815e-04 -56.123  < 2e-16 ***
## hourly_hum           -1.149e-02  2.423e-04 -47.413  < 2e-16 ***
## hourly_precip         1.078e+00  2.070e-01   5.206 1.93e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df       F
## s(as.numeric(month))                                     1.351  3.000   0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.253  8.253  62.118
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.976 80.000 130.311
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 126/132
## R-sq.(adj) =  0.175   Deviance explained = 17.5%
## GCV = 1.4048  Scale est. = 1.4037    n = 153718

Disaster Only

# Disaster only
summary(hm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", 
##     k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##     k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month + 
##     weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + 
##     I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km + 
##     inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.751e+02  5.365e+02   1.445 0.148540    
## month11              -1.138e+01  1.069e+01  -1.065 0.286956    
## month12              -6.246e-01  1.157e+01  -0.054 0.956966    
## weekdayMon           -4.502e+00  5.492e+00  -0.820 0.412317    
## weekdayTue           -1.019e+01  5.510e+00  -1.850 0.064385 .  
## weekdayWed           -7.691e-01  5.439e+00  -0.141 0.887550    
## weekdayThu            5.550e+00  5.432e+00   1.022 0.306954    
## weekdayFri            9.091e+00  5.312e+00   1.711 0.087021 .  
## weekdaySat            7.882e+00  5.374e+00   1.467 0.142518    
## wd_avg               -7.400e-02  1.474e-02  -5.020 5.21e-07 ***
## ws_avg                3.090e+00  7.859e-01   3.932 8.45e-05 ***
## I(1/dist_wrp^2)      -1.485e-03  3.691e-04  -4.023 5.76e-05 ***
## I(1/dist_ref^2)      -3.664e-03  6.026e-03  -0.608 0.543217    
## I(1/dist_dc^2)        1.315e+01  3.620e+00   3.632 0.000282 ***
## monthly_oil_2km      -3.991e-02  1.803e-02  -2.213 0.026905 *  
## monthly_gas_2km      -6.194e-02  4.771e-02  -1.298 0.194215    
## active_2km           -2.612e+01  4.175e+00  -6.257 3.97e-10 ***
## inactive_2km          1.629e+02  1.456e+01  11.184  < 2e-16 ***
## elevation            -8.281e+01  3.893e+00 -21.274  < 2e-16 ***
## EVI                  -1.337e+03  9.018e+01 -14.826  < 2e-16 ***
## num_odor_complaints  -2.810e+00  1.361e-01 -20.642  < 2e-16 ***
## closest_wrp_capacity  2.611e+00  9.956e-01   2.623 0.008732 ** 
## hourly_downwind_wrp   1.212e+01  5.625e+00   2.155 0.031166 *  
## hourly_temp          -1.725e+00  2.936e-01  -5.876 4.25e-09 ***
## hourly_hum           -2.430e-01  8.825e-02  -2.754 0.005899 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.723  8.939 92.02
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.248 80.000 68.80
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =  0.223   Deviance explained = 22.6%
## GCV =  62223  Scale est. = 61999     n = 30242

Exclude Disaster

# Exclude disaster
summary(hm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     active_2km + inactive_2km + elevation + EVI + num_odor_complaints + 
##     closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp + 
##     hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.053e+00  7.042e+00   1.143  0.25284    
## month02              -3.310e-01  4.340e+00  -0.076  0.93921    
## month03              -4.146e-01  7.819e+00  -0.053  0.95771    
## month04              -2.932e-01  1.044e+01  -0.028  0.97759    
## month05              -2.902e-01  1.219e+01  -0.024  0.98101    
## month06              -2.612e-01  1.308e+01  -0.020  0.98406    
## month07              -7.371e-02  1.308e+01  -0.006  0.99550    
## month08               1.403e-01  1.219e+01   0.012  0.99082    
## month09               1.825e-01  1.044e+01   0.017  0.98605    
## month10               6.698e-03  7.819e+00   0.001  0.99932    
## month11              -5.259e-02  4.340e+00  -0.012  0.99033    
## month12               3.365e-02  1.446e-02   2.327  0.01998 *  
## weekdayMon            1.364e-01  8.303e-03  16.431  < 2e-16 ***
## weekdayTue            2.178e-01  8.235e-03  26.448  < 2e-16 ***
## weekdayWed            2.374e-01  8.227e-03  28.860  < 2e-16 ***
## weekdayThu            2.182e-01  8.226e-03  26.529  < 2e-16 ***
## weekdayFri            2.048e-01  8.228e-03  24.886  < 2e-16 ***
## weekdaySat            9.257e-02  8.213e-03  11.272  < 2e-16 ***
## wd_avg               -3.425e-04  2.610e-05 -13.121  < 2e-16 ***
## ws_avg               -4.396e-02  8.727e-04 -50.375  < 2e-16 ***
## I(1/dist_wrp^2)      -6.324e-01  2.381e-02 -26.557  < 2e-16 ***
## I(1/dist_ref^2)       5.842e+00  2.200e-01  26.557  < 2e-16 ***
## I(1/dist_dc^2)        4.251e+03  1.601e+02  26.557  < 2e-16 ***
## active_2km            7.428e-03  1.426e-03   5.210 1.89e-07 ***
## inactive_2km         -4.958e-02  4.421e-03 -11.214  < 2e-16 ***
## elevation            -4.963e-03  1.910e-03  -2.598  0.00937 ** 
## EVI                  -1.109e+00  8.058e-02 -13.761  < 2e-16 ***
## num_odor_complaints   1.379e-02  6.403e-04  21.532  < 2e-16 ***
## closest_wrp_capacity -7.233e-03  4.527e-04 -15.976  < 2e-16 ***
## hourly_downwind_ref  -6.319e-02  7.897e-03  -8.002 1.23e-15 ***
## hourly_downwind_wrp   4.382e-02  8.722e-03   5.025 5.05e-07 ***
## hourly_temp          -3.845e-02  4.771e-04 -80.601  < 2e-16 ***
## hourly_hum           -1.090e-02  1.638e-04 -66.542  < 2e-16 ***
## hourly_precip        -7.501e-02  1.949e-01  -0.385  0.70030    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -1.219e-07  3.000   0.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.925e+00  8.989 219.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.969e+01 80.000 214.2
##                                                         p-value    
## s(as.numeric(month))                                          1    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/131
## R-sq.(adj) =  0.148   Deviance explained = 14.9%
## GCV = 1.6319  Scale est. = 1.6313    n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(hm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.265e+02  9.130e+01  -3.577 0.000348 ***
## month02              -4.106e-02  5.926e+01  -0.001 0.999447    
## month03              -6.275e-01  1.068e+02  -0.006 0.995310    
## month04              -4.214e-01  1.425e+02  -0.003 0.997641    
## month05              -8.743e-01  1.665e+02  -0.005 0.995810    
## month06               5.939e-01  1.785e+02   0.003 0.997346    
## month07               9.327e-01  1.785e+02   0.005 0.995831    
## month08               2.005e+00  1.665e+02   0.012 0.990392    
## month09               3.129e+00  1.425e+02   0.022 0.982481    
## month10               9.828e+00  1.068e+02   0.092 0.926656    
## month11              -7.470e+00  5.926e+01  -0.126 0.899701    
## month12              -7.052e+00  8.498e-01  -8.298  < 2e-16 ***
## weekdayMon           -1.332e+00  4.925e-01  -2.704 0.006846 ** 
## weekdayTue           -1.195e+00  4.886e-01  -2.445 0.014492 *  
## weekdayWed           -6.115e-01  4.881e-01  -1.253 0.210237    
## weekdayThu           -1.655e-01  4.879e-01  -0.339 0.734445    
## weekdayFri            5.652e-01  4.874e-01   1.160 0.246200    
## weekdaySat            4.511e-01  4.872e-01   0.926 0.354507    
## wd_avg               -1.638e-02  1.523e-03 -10.759  < 2e-16 ***
## ws_avg                3.331e-01  5.199e-02   6.407 1.49e-10 ***
## I(1/dist_wrp^2)       8.692e-06  8.182e-06   1.062 0.288079    
## I(1/dist_ref^2)      -6.508e-03  8.856e-04  -7.349 2.00e-13 ***
## I(1/dist_dc^2)        5.031e-01  6.269e-02   8.025 1.02e-15 ***
## monthly_oil_2km       6.642e-04  1.872e-04   3.548 0.000388 ***
## monthly_gas_2km       5.620e-06  3.777e-04   0.015 0.988130    
## active_2km            5.131e-01  8.860e-02   5.791 7.00e-09 ***
## inactive_2km          5.517e+00  2.490e-01  22.156  < 2e-16 ***
## elevation            -2.927e+00  1.127e-01 -25.966  < 2e-16 ***
## EVI                  -1.163e+02  4.656e+00 -24.981  < 2e-16 ***
## num_odor_complaints   1.270e+00  2.143e-02  59.263  < 2e-16 ***
## closest_wrp_capacity  8.840e-01  3.090e-02  28.605  < 2e-16 ***
## hourly_downwind_ref  -4.312e+00  4.602e-01  -9.369  < 2e-16 ***
## hourly_downwind_wrp   2.998e+00  5.164e-01   5.806 6.42e-09 ***
## hourly_temp          -4.508e-01  2.725e-02 -16.542  < 2e-16 ***
## hourly_hum           -1.120e-01  9.152e-03 -12.233  < 2e-16 ***
## disaster              1.667e+01  1.198e+00  13.919  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -5.224e-09      2  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000e+00      9 90.65
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.719e+01     80 82.80
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 130/133
## R-sq.(adj) =  0.0508   Deviance explained = 5.11%
## GCV =   6255  Scale est. = 6253      n = 367838

Everything w.o Disaster Indicator

# Everything
summary(hm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.183e+02  9.133e+01  -3.486 0.000491 ***
## month02              -2.781e-01  5.928e+01  -0.005 0.996257    
## month03              -9.187e-01  1.068e+02  -0.009 0.993136    
## month04              -7.947e-01  1.426e+02  -0.006 0.995552    
## month05              -1.268e+00  1.665e+02  -0.008 0.993926    
## month06               3.780e-01  1.786e+02   0.002 0.998311    
## month07               1.063e+00  1.786e+02   0.006 0.995251    
## month08               2.511e+00  1.665e+02   0.015 0.987968    
## month09               3.783e+00  1.426e+02   0.027 0.978831    
## month10               1.635e+01  1.068e+02   0.153 0.878285    
## month11              -1.235e+00  5.928e+01  -0.021 0.983383    
## month12              -1.170e+00  7.385e-01  -1.584 0.113133    
## weekdayMon           -1.328e+00  4.927e-01  -2.696 0.007023 ** 
## weekdayTue           -1.183e+00  4.888e-01  -2.420 0.015536 *  
## weekdayWed           -6.132e-01  4.882e-01  -1.256 0.209132    
## weekdayThu           -1.974e-01  4.881e-01  -0.404 0.685861    
## weekdayFri            5.322e-01  4.875e-01   1.092 0.274972    
## weekdaySat            4.249e-01  4.873e-01   0.872 0.383216    
## wd_avg               -1.649e-02  1.525e-03 -10.812  < 2e-16 ***
## ws_avg                3.342e-01  5.246e-02   6.370 1.89e-10 ***
## I(1/dist_wrp^2)       1.135e-05  8.398e-06   1.352 0.176470    
## I(1/dist_ref^2)      -6.539e-03  8.889e-04  -7.356 1.89e-13 ***
## I(1/dist_dc^2)        4.948e-01  6.276e-02   7.884 3.18e-15 ***
## monthly_oil_2km       7.443e-04  1.879e-04   3.961 7.46e-05 ***
## monthly_gas_2km       2.943e-05  3.845e-04   0.077 0.938982    
## active_2km            4.500e-01  8.862e-02   5.078 3.82e-07 ***
## inactive_2km          5.624e+00  2.494e-01  22.547  < 2e-16 ***
## elevation            -2.921e+00  1.129e-01 -25.876  < 2e-16 ***
## EVI                  -1.157e+02  4.659e+00 -24.838  < 2e-16 ***
## num_odor_complaints   1.287e+00  2.140e-02  60.123  < 2e-16 ***
## closest_wrp_capacity  8.927e-01  3.096e-02  28.832  < 2e-16 ***
## hourly_downwind_ref  -4.414e+00  4.605e-01  -9.585  < 2e-16 ***
## hourly_downwind_wrp   3.074e+00  5.166e-01   5.951 2.66e-09 ***
## hourly_temp          -4.566e-01  2.730e-02 -16.726  < 2e-16 ***
## hourly_hum           -1.164e-01  9.231e-03 -12.609  < 2e-16 ***
## hourly_precip        -3.859e+00  1.063e+01  -0.363 0.716519    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df     F
## s(as.numeric(month))                                    -1.072e-09      2  0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000e+00      9 91.60
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.752e+01     80 80.74
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 130/133
## R-sq.(adj) =  0.0503   Deviance explained = 5.06%
## GCV = 6258.3  Scale est. = 6256.3    n = 367838

Log Hourly Max

Since February 2022

# Since feb 2022
summary(log_hm_sincefeb2022_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + hourly_downwind_ref + 
##     hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.311e+00  3.070e-01  10.782  < 2e-16 ***
## month02              4.794e-02  1.802e-01   0.266 0.790211    
## month03              2.318e-01  3.241e-01   0.715 0.474487    
## month04              3.997e-01  4.325e-01   0.924 0.355314    
## month05              2.830e-01  5.050e-01   0.560 0.575158    
## month06              1.408e-01  5.414e-01   0.260 0.794748    
## month07              1.918e-03  5.413e-01   0.004 0.997173    
## month08              4.657e-03  5.047e-01   0.009 0.992638    
## month09              1.255e-01  4.321e-01   0.290 0.771517    
## month10             -4.471e-02  3.241e-01  -0.138 0.890269    
## month11             -1.357e-01  1.806e-01  -0.751 0.452472    
## month12             -2.053e-01  1.400e-02 -14.671  < 2e-16 ***
## weekdayMon           1.154e-01  6.146e-03  18.768  < 2e-16 ***
## weekdayTue           1.813e-01  6.092e-03  29.755  < 2e-16 ***
## weekdayWed           1.742e-01  6.107e-03  28.521  < 2e-16 ***
## weekdayThu           1.290e-01  6.110e-03  21.111  < 2e-16 ***
## weekdayFri           1.431e-01  6.088e-03  23.502  < 2e-16 ***
## weekdaySat           8.866e-02  6.071e-03  14.603  < 2e-16 ***
## wd_avg              -4.300e-04  1.953e-05 -22.020  < 2e-16 ***
## ws_avg              -6.787e-02  7.607e-04 -89.220  < 2e-16 ***
## I(1/dist_wrp^2)      7.169e-07  7.728e-08   9.277  < 2e-16 ***
## I(1/dist_ref^2)     -2.919e-07  2.118e-06  -0.138 0.890390    
## I(1/dist_dc^2)      -5.394e-04  2.691e-05 -20.042  < 2e-16 ***
## monthly_oil_2km     -3.318e-06  2.403e-06  -1.381 0.167333    
## monthly_gas_2km     -3.821e-05  6.994e-06  -5.463 4.69e-08 ***
## active_2km           2.169e-02  1.925e-03  11.270  < 2e-16 ***
## inactive_2km        -1.578e-02  4.027e-03  -3.917 8.96e-05 ***
## elevation           -3.947e-02  3.147e-03 -12.543  < 2e-16 ***
## EVI                 -2.344e+00  7.236e-02 -32.393  < 2e-16 ***
## num_odor_complaints  1.751e-02  7.962e-04  21.997  < 2e-16 ***
## hourly_downwind_ref -6.058e-03  5.971e-03  -1.015 0.310298    
## hourly_downwind_wrp  7.632e-02  6.900e-03  11.060  < 2e-16 ***
## hourly_temp         -3.136e-02  3.665e-04 -85.567  < 2e-16 ***
## hourly_hum          -1.017e-02  1.303e-04 -78.072  < 2e-16 ***
## hourly_precip        3.703e-01  1.113e-01   3.328 0.000876 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    2.618e-11      0   Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  9.000e+00      9 227.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01     80 896.8
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/132
## R-sq.(adj) =  0.496   Deviance explained = 49.7%
## GCV = 0.40612  Scale est. = 0.4058    n = 153718

Disaster Only

# Disaster only
summary(log_hm_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + 
##     I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km + 
##     inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.300e+01  2.000e+00 -11.498  < 2e-16 ***
## month11               1.587e-01  3.764e-02   4.216 2.50e-05 ***
## month12              -7.281e-02  3.887e-02  -1.873 0.061078 .  
## weekdayMon            5.560e-02  1.941e-02   2.864 0.004188 ** 
## weekdayTue           -7.745e-02  1.948e-02  -3.976 7.04e-05 ***
## weekdayWed            1.200e-01  1.923e-02   6.242 4.37e-10 ***
## weekdayThu            1.086e-01  1.940e-02   5.597 2.20e-08 ***
## weekdayFri            3.229e-02  1.879e-02   1.719 0.085638 .  
## weekdaySat           -7.210e-02  1.900e-02  -3.795 0.000148 ***
## wd_avg               -7.589e-04  5.254e-05 -14.443  < 2e-16 ***
## ws_avg               -1.126e-01  2.863e-03 -39.351  < 2e-16 ***
## I(1/dist_wrp^2)      -4.404e-05  3.692e-06 -11.929  < 2e-16 ***
## I(1/dist_ref^2)      -2.540e-04  2.117e-05 -11.999  < 2e-16 ***
## I(1/dist_dc^2)        3.175e-01  2.604e-02  12.191  < 2e-16 ***
## monthly_oil_2km       6.092e-04  4.319e-05  14.105  < 2e-16 ***
## active_2km           -4.152e-01  1.346e-02 -30.847  < 2e-16 ***
## inactive_2km          7.570e-01  4.627e-02  16.360  < 2e-16 ***
## elevation            -2.215e-01  1.232e-02 -17.975  < 2e-16 ***
## num_odor_complaints   4.079e-03  4.842e-04   8.424  < 2e-16 ***
## closest_wrp_capacity  6.850e-02  3.809e-03  17.985  < 2e-16 ***
## hourly_downwind_ref  -1.588e-01  1.574e-02 -10.090  < 2e-16 ***
## hourly_downwind_wrp   2.311e-01  2.005e-02  11.528  < 2e-16 ***
## hourly_temp          -3.327e-02  1.039e-03 -32.031  < 2e-16 ***
## hourly_hum           -9.715e-03  3.157e-04 -30.775  < 2e-16 ***
## hourly_precip         1.019e+00  2.722e-01   3.744 0.000181 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 238.98
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.36     80  89.27
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/114
## R-sq.(adj) =   0.56   Deviance explained = 56.2%
## GCV = 0.77762  Scale est. = 0.77478   n = 30242

Exclude Disaster

# Exclude disaster
summary(log_hm_excl_dis_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           4.664e+00  2.381e-01   19.586  < 2e-16 ***
## month02              -3.220e-01  1.211e-01   -2.658 0.007854 ** 
## month03              -3.962e-01  2.180e-01   -1.817 0.069213 .  
## month04              -2.329e-01  2.910e-01   -0.800 0.423594    
## month05              -1.459e-01  3.400e-01   -0.429 0.667782    
## month06              -1.049e-01  3.646e-01   -0.288 0.773596    
## month07               9.324e-02  3.646e-01    0.256 0.798137    
## month08               3.447e-01  3.400e-01    1.014 0.310661    
## month09               3.688e-01  2.911e-01    1.267 0.205198    
## month10               4.764e-02  2.182e-01    0.218 0.827147    
## month11               2.540e-02  1.213e-01    0.209 0.834128    
## month12               2.105e-02  7.852e-03    2.681 0.007343 ** 
## weekdayMon            1.195e-01  4.498e-03   26.560  < 2e-16 ***
## weekdayTue            1.722e-01  4.461e-03   38.609  < 2e-16 ***
## weekdayWed            1.947e-01  4.457e-03   43.692  < 2e-16 ***
## weekdayThu            1.762e-01  4.456e-03   39.530  < 2e-16 ***
## weekdayFri            1.641e-01  4.458e-03   36.824  < 2e-16 ***
## weekdaySat            8.189e-02  4.449e-03   18.406  < 2e-16 ***
## wd_avg               -6.385e-04  1.414e-05  -45.147  < 2e-16 ***
## ws_avg               -4.274e-02  4.729e-04  -90.360  < 2e-16 ***
## I(1/dist_wrp^2)       1.673e-06  1.266e-07   13.216  < 2e-16 ***
## I(1/dist_ref^2)      -6.065e-05  5.807e-06  -10.444  < 2e-16 ***
## I(1/dist_dc^2)        6.920e-04  2.075e-04    3.335 0.000852 ***
## monthly_oil_2km      -6.836e-06  1.841e-06   -3.714 0.000204 ***
## monthly_gas_2km      -5.265e-05  3.598e-06  -14.633  < 2e-16 ***
## active_2km            2.082e-02  8.355e-04   24.920  < 2e-16 ***
## inactive_2km         -3.070e-02  2.436e-03  -12.600  < 2e-16 ***
## elevation             8.180e-04  1.025e-03    0.798 0.424883    
## EVI                  -2.091e+00  4.337e-02  -48.211  < 2e-16 ***
## num_odor_complaints   4.475e-03  3.470e-04   12.896  < 2e-16 ***
## closest_wrp_capacity -4.812e-03  2.976e-04  -16.166  < 2e-16 ***
## hourly_downwind_ref  -3.486e-02  4.275e-03   -8.155 3.51e-16 ***
## hourly_downwind_wrp   8.342e-02  4.729e-03   17.641  < 2e-16 ***
## hourly_temp          -2.969e-02  2.585e-04 -114.847  < 2e-16 ***
## hourly_hum           -8.471e-03  8.876e-05  -95.440  < 2e-16 ***
## hourly_precip        -6.580e-01  1.056e-01   -6.233 4.59e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df    F
## s(as.numeric(month))                                    4.462e-10      1    0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  8.999e+00      9 1103
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01     80 1352
##                                                         p-value    
## s(as.numeric(month))                                      0.641    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 130/133
## R-sq.(adj) =   0.41   Deviance explained =   41%
## GCV = 0.47893  Scale est. = 0.47876   n = 337596

Everything w Disaster Indicator

# Disaster indicator
summary(log_hm_dis_ind_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip + disaster
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.223e+00  1.391e-01   15.976  < 2e-16 ***
## month02              -3.453e-01  2.860e-02  -12.076  < 2e-16 ***
## month03              -4.884e-01  5.069e-02   -9.634  < 2e-16 ***
## month04              -3.897e-01  6.752e-02   -5.772 7.82e-09 ***
## month05              -3.310e-01  7.875e-02   -4.204 2.63e-05 ***
## month06              -2.829e-01  8.434e-02   -3.355 0.000795 ***
## month07              -6.864e-02  8.421e-02   -0.815 0.415043    
## month08               2.333e-01  7.851e-02    2.972 0.002961 ** 
## month09               3.135e-01  6.733e-02    4.655 3.24e-06 ***
## month10               1.756e-01  5.094e-02    3.447 0.000566 ***
## month11              -9.293e-04  2.917e-02   -0.032 0.974588    
## month12              -1.378e-01  8.008e-03  -17.210  < 2e-16 ***
## weekdayMon            1.077e-01  4.628e-03   23.262  < 2e-16 ***
## weekdayTue            1.450e-01  4.592e-03   31.580  < 2e-16 ***
## weekdayWed            1.853e-01  4.586e-03   40.400  < 2e-16 ***
## weekdayThu            1.656e-01  4.586e-03   36.119  < 2e-16 ***
## weekdayFri            1.501e-01  4.580e-03   32.779  < 2e-16 ***
## weekdaySat            6.817e-02  4.578e-03   14.891  < 2e-16 ***
## wd_avg               -7.101e-04  1.433e-05  -49.557  < 2e-16 ***
## ws_avg               -4.462e-02  4.930e-04  -90.502  < 2e-16 ***
## I(1/dist_wrp^2)       7.858e-07  1.094e-07    7.180 6.99e-13 ***
## I(1/dist_ref^2)      -7.458e-05  5.143e-06  -14.502  < 2e-16 ***
## I(1/dist_dc^2)        6.322e-03  2.534e-04   24.949  < 2e-16 ***
## monthly_oil_2km      -1.097e-05  1.796e-06   -6.106 1.03e-09 ***
## monthly_gas_2km      -9.860e-06  3.989e-06   -2.472 0.013439 *  
## active_2km            2.362e-02  8.525e-04   27.711  < 2e-16 ***
## inactive_2km          7.026e-03  2.458e-03    2.858 0.004263 ** 
## elevation            -2.521e-02  1.064e-03  -23.692  < 2e-16 ***
## EVI                  -2.889e+00  4.382e-02  -65.915  < 2e-16 ***
## num_odor_complaints   1.345e-02  2.015e-04   66.764  < 2e-16 ***
## closest_wrp_capacity  1.569e-03  2.938e-04    5.341 9.23e-08 ***
## hourly_downwind_ref  -6.705e-02  4.327e-03  -15.497  < 2e-16 ***
## hourly_downwind_wrp   1.039e-01  4.854e-03   21.407  < 2e-16 ***
## hourly_temp          -3.271e-02  2.565e-04 -127.507  < 2e-16 ***
## hourly_hum           -9.063e-03  8.679e-05 -104.426  < 2e-16 ***
## hourly_precip        -8.261e-01  9.986e-02   -8.273  < 2e-16 ***
## disaster              5.243e-01  1.127e-02   46.530  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                                edf Ref.df
## s(as.numeric(month))                                    -1.493e-11  0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.974e+00  8.999
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  7.992e+01 80.000
##                                                              F p-value    
## s(as.numeric(month))                                       Inf  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   474.2  <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1297.5  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 131/134
## R-sq.(adj) =  0.412   Deviance explained = 41.3%
## GCV = 0.5523  Scale est. = 0.55212   n = 367838

Everything w.o Disaster Indicator

# Everything
summary(log_hm_full_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + month + weekday + wd_avg + 
##     ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + 
##     monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km + 
##     elevation + EVI + num_odor_complaints + closest_wrp_capacity + 
##     hourly_downwind_ref + hourly_downwind_wrp + hourly_temp + 
##     hourly_hum + hourly_precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.285e+00  2.731e-01    8.365  < 2e-16 ***
## month02              -1.936e-01  7.609e-01   -0.254   0.7992    
## month03              -1.795e-01  9.584e-01   -0.187   0.8514    
## month04               2.064e-02  7.835e-01    0.026   0.9790    
## month05               9.684e-02  6.327e-01    0.153   0.8784    
## month06               8.011e-02  6.945e-01    0.115   0.9082    
## month07               1.716e-01  6.937e-01    0.247   0.8046    
## month08               3.306e-01  6.314e-01    0.524   0.6006    
## month09               2.860e-01  7.831e-01    0.365   0.7149    
## month10               2.670e-01  9.584e-01    0.279   0.7806    
## month11               9.732e-02  7.609e-01    0.128   0.8982    
## month12               4.738e-02  6.970e-03    6.798 1.07e-11 ***
## weekdayMon            1.077e-01  4.642e-03   23.201  < 2e-16 ***
## weekdayTue            1.453e-01  4.605e-03   31.551  < 2e-16 ***
## weekdayWed            1.852e-01  4.600e-03   40.259  < 2e-16 ***
## weekdayThu            1.644e-01  4.599e-03   35.754  < 2e-16 ***
## weekdayFri            1.490e-01  4.593e-03   32.436  < 2e-16 ***
## weekdaySat            6.734e-02  4.591e-03   14.667  < 2e-16 ***
## wd_avg               -7.118e-04  1.437e-05  -49.531  < 2e-16 ***
## ws_avg               -4.471e-02  4.944e-04  -90.420  < 2e-16 ***
## I(1/dist_wrp^2)       1.040e-06  1.089e-07    9.546  < 2e-16 ***
## I(1/dist_ref^2)      -8.449e-05  5.591e-06  -15.112  < 2e-16 ***
## I(1/dist_dc^2)        6.098e-03  2.898e-04   21.041  < 2e-16 ***
## monthly_oil_2km      -8.581e-06  1.801e-06   -4.764 1.90e-06 ***
## monthly_gas_2km      -9.137e-06  4.005e-06   -2.281   0.0225 *  
## active_2km            2.129e-02  8.545e-04   24.909  < 2e-16 ***
## inactive_2km          1.214e-02  2.470e-03    4.913 8.97e-07 ***
## elevation            -2.497e-02  1.067e-03  -23.389  < 2e-16 ***
## EVI                  -2.869e+00  4.397e-02  -65.257  < 2e-16 ***
## num_odor_complaints   1.399e-02  2.017e-04   69.367  < 2e-16 ***
## closest_wrp_capacity  1.921e-03  2.949e-04    6.516 7.25e-11 ***
## hourly_downwind_ref  -7.015e-02  4.339e-03  -16.168  < 2e-16 ***
## hourly_downwind_wrp   1.062e-01  4.868e-03   21.813  < 2e-16 ***
## hourly_temp          -3.290e-02  2.572e-04 -127.912  < 2e-16 ***
## hourly_hum           -9.221e-03  8.698e-05 -106.018  < 2e-16 ***
## hourly_precip        -7.941e-01  1.002e-01   -7.929 2.22e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df        F
## s(as.numeric(month))                                     1.354      4    0.004
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.983      9  478.832
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.974     80 1272.465
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 128/133
## R-sq.(adj) =  0.409   Deviance explained = 40.9%
## GCV = 0.55555  Scale est. = 0.55536   n = 367838

Helper Function

adj_r2 <- function(r2, n, p){
  # n-p since p here includes intercept
  return(1 - (1-r2)*(n - 1)/(n - p))
}

get_bt_adj_r2 <- function(name, response, daterange) {
  data <- get_data(response, daterange)
  model <- get(paste0(name,'_', daterange, '_gam'))
  response_colname <- names(model$model)[1]
  response_colname <- str_sub(response_colname, 5, -2)
  predictions <- predict(model, newdata = data) 
  bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
  bt_adj_r2 <- adj_r2(bt_R2, summary(model)$n, summary(model)$np)
  return(bt_adj_r2)
}

GAM result

# comp r2
data <- get_data('log(H2S_daily_avg)', 'dis_ind')
model <- get(paste0('log_da','_', 'dis_ind', '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data) 
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))

print('R2 computed using R2() function from caret package')
## [1] "R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname) %>% log(), predictions)
## [1] 0.5544955
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname) %>% log(), predictions),
       summary(model)$n, summary(model)$np)
## [1] 0.5508375
print('BT-R2 computed using R2() function from caret package')
## [1] "BT-R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname), exp(predictions))
## [1] 0.0004193897
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname), exp(predictions)),
       summary(model)$n, summary(model)$np)
## [1] -0.007788197
print('Returned by get_bt_adj_r2')
## [1] "Returned by get_bt_adj_r2"
get_bt_adj_r2('log_da', 'log(H2S_daily_avg)', 'dis_ind')
## [1] -0.007788197
print('Fit obs(y) ~ exp(predicted y)')
## [1] "Fit obs(y) ~ exp(predicted y)"
model_r2 <- lm(data %>% pull(response_colname) ~ exp(predictions))
summary(model_r2)$r.sq
## [1] 0.0004193897
summary(model_r2)$df[1] + summary(model_r2)$df[2]
## [1] 15595
summary(model_r2)$df[1]
## [1] 2
summary(model_r2)$adj.r.squared
## [1] 0.0003552853
adj_r2(summary(model_r2)$r.sq,
       summary(model_r2)$df[1] + summary(model_r2)$df[2],
       summary(model_r2)$df[1])
## [1] 0.0003552853
date_names <- c('Since Feb 2022', 'Disaster Only', 'Exclude Disaster',
                'Everything w. Disaster Indicator', 
                'Everything w.o Disaster Indicator')
response_disp_names <- c('Daily Avg', 'Log Daily Avg', 'Daily Max', 'Log Daily Max',
                    'Hourly Avg', 'Log Hourly Avg', 'Hourly Max', 'Log Hourly Max')

gam_result_table <- expand.grid(date_names, response_disp_names) %>%
  setNames(c('date_names', 'response_disp_names'))

date_name_conversion <- tibble(date_names = date_names,
                               daterange = dateranges)

response_name_conversion <- tibble(response_disp_names = unique(gam_result_table$response_disp_names),
                                   response_obj_name = c(daily_responses, hourly_responses),
                                   model_response_name = response_names,
                                   transformation = rep(c('', 'Log'), 4))

gam_result_table <- gam_result_table %>%
  left_join(date_name_conversion) %>%
  left_join(response_name_conversion)
## Joining with `by = join_by(date_names)`
## Joining with `by = join_by(response_disp_names)`
gam_result_table <- gam_result_table %>%
  mutate(adjr2 = NA, 
         bt_adjr2 = NA,
         n = NA,
         p = NA)

for (i in 1:nrow(gam_result_table)) {
  name <- gam_result_table$model_response_name[i]
  response <- gam_result_table$response_obj_name[i]
  daterange <- gam_result_table$daterange[i]
  model <- get(paste0(name,'_', daterange, '_gam'))
  if (str_detect(response, 'log\\(')) {
    bt_adjr2 <- get_bt_adj_r2(name, response, daterange)
  } else {
    bt_adjr2 <- NA
  }
  
  adjr2 <- summary(model)$r.sq
  n <- summary(model)$n
  p <- summary(model)$np
  
  new_columns <- tibble(adjr2 = adjr2, bt_adjr2 = bt_adjr2, n = n, p = p)
  new_row <- bind_cols(gam_result_table[i, 1:6], new_columns)
  gam_result_table[i, ] <- new_row
  print(str_glue('Completed {i} iterations'))
}
## Completed 1 iterations
## Completed 2 iterations
## Completed 3 iterations
## Completed 4 iterations
## Completed 5 iterations
## Completed 6 iterations
## Completed 7 iterations
## Completed 8 iterations
## Completed 9 iterations
## Completed 10 iterations
## Completed 11 iterations
## Completed 12 iterations
## Completed 13 iterations
## Completed 14 iterations
## Completed 15 iterations
## Completed 16 iterations
## Completed 17 iterations
## Completed 18 iterations
## Completed 19 iterations
## Completed 20 iterations
## Completed 21 iterations
## Completed 22 iterations
## Completed 23 iterations
## Completed 24 iterations
## Completed 25 iterations
## Completed 26 iterations
## Completed 27 iterations
## Completed 28 iterations
## Completed 29 iterations
## Completed 30 iterations
## Completed 31 iterations
## Completed 32 iterations
## Completed 33 iterations
## Completed 34 iterations
## Completed 35 iterations
## Completed 36 iterations
## Completed 37 iterations
## Completed 38 iterations
## Completed 39 iterations
## Completed 40 iterations
temp <- rep(rep(response_disp_names[!str_detect(response_disp_names, 'Log')], each =2), 5)

base_table <- gam_result_table %>%
  arrange(factor(date_names, levels = .env$date_names)) %>%
  mutate(response_base = temp,
       `bt_adjr2` = '') %>%
  filter(transformation == '') %>%
  select(all_of(c('date_names', 'response_base', 'model_response_name', 'adjr2', 'bt_adjr2', 'n', 'p'))) %>%
  select(-bt_adjr2)

log_table <- gam_result_table %>%
  arrange(factor(date_names, levels = .env$date_names)) %>%
  mutate(response_base = temp) %>%
  filter(transformation == 'Log') %>%
  select(all_of(c('date_names', 'response_base', 'model_response_name','adjr2', 'bt_adjr2', 'n', 'p')))

gam_result_table_fordisp <- base_table %>%
  left_join(log_table, join_by(date_names, response_base)) %>%
  select(-'date_names', -starts_with('model_response_name')) %>%
  setNames(c('Response', c('Adj.R2', 'N', 'P'), c('Adj.R2', 'BT-Adj.R2', 'N', 'P')))

gam_result_table_kable <- gam_result_table_fordisp %>%
  knitr::kable(format = 'latex', digits = 2) %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))

writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')

gam_result_table_fordisp %>%
  knitr::kable(format = 'html', digits = 2, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
No Transformation
Log-Transformation
Response Adj.R2 N P Adj.R2 BT-Adj.R2 N P
Since Feb 2022
Daily Avg 0.62 6531 127 0.70 0.62 6531 126
Daily Max 0.21 6531 126 0.54 0.22 6531 125
Hourly Avg 0.35 153718 133 0.55 0.34 153718 132
Hourly Max 0.17 153718 132 0.50 0.16 153718 132
Disaster Only
Daily Avg 0.42 1273 105 0.75 0.55 1273 111
Daily Max 0.43 1273 99 0.72 0.44 1273 110
Hourly Avg 0.21 30242 114 0.58 0.22 30242 114
Hourly Max 0.22 30242 114 0.56 0.24 30242 114
Exclude Disaster
Daily Avg 0.46 14322 128 0.57 0.46 14322 129
Daily Max 0.12 14322 123 0.48 0.11 14322 128
Hourly Avg 0.25 337596 132 0.44 0.25 337596 133
Hourly Max 0.15 337596 131 0.41 0.14 337596 133
Everything w D.I
Daily Avg 0.11 15595 123 0.55 -0.01 15595 128
Daily Max 0.12 15595 123 0.50 -0.01 15595 128
Hourly Avg 0.05 367838 134 0.44 0.00 367838 133
Hourly Max 0.05 367838 133 0.41 0.00 367838 134
Everything w.o D.I
Daily Avg 0.11 15595 122 0.54 -0.01 15595 127
Daily Max 0.12 15595 123 0.49 -0.01 15595 128
Hourly Avg 0.05 367838 133 0.43 0.00 367838 132
Hourly Max 0.05 367838 133 0.41 0.00 367838 133

XGBoost (Daily Average)

Since 2022 Feb

validation_result <- tibble(Model = character(),
                               model_response_name = character(),
                               daterange = character(),
                               'Coef' = character(),
                               'R-Sq' = numeric(),
                               'Disaster RMSE' = numeric(),
                               'Normal RMSE' = numeric())

xgb_result <- tibble(Model = character(),
                        model_response_name = character(),
                        daterange = character(),
                        'R-Sq' = numeric(),
                        'BT R-Sq' = numeric(),
                        'RMSE' = numeric(),
                        'BT RMSE' = numeric())

fit.xgb_da_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_sincefeb2022)
fit.xgb_da_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 700
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 141     700         6 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x", 
           "Latitude" = "mon_utm_y",
           "Distance to Refinery" = "dist_ref", 
           "Angle to Refinery" = "angle_ref",
           "Active Wells within 2km" = "active_2km", 
           "Inactive Wells within 2km" = "inactive_2km",
           "Monthly Oil Production 2km" = "monthly_oil_2km",
           "Monthly Gas Production 2km" = "monthly_gas_2km",
           "Distance to WRP" = "dist_wrp",
           "WRP Capacity" = "closest_wrp_capacity",
           "Angle to WRP" = "angle_wrp",
           "Distance to Dominguez Channel" = "dist_dc",
           "Average Daily Temperature" = "daily_temp",
           "Average Daily Humidity" = "daily_hum",
           "Daily Precipitation" = "daily_precip",
           "Average Daily Wind Speed" = "ws_avg",
           "Average Daily Wind Direction" = "wd_avg",
           "Downwind Refinery" = "daily_downwind_ref",
           "Downwind WRP" = "daily_downwind_wrp",
           "Elevation" = "elevation",
           "Enhanced Vegetation Index" = "EVI",
           "Number of Daily Odor Complaints" = "num_odor_complaints",
           "2020" = "year_2020",
           "2021" = "year_2021",
           "2022" = "year_2022",
           "2023" = "year_2023",
           "January" = "month_01",
           "February" = "month_02",
           "March" = "month_03",
           "April" = "month_04",
           "May" = "month_05",
           "June" = "month_06", 
           "July" = "month_07",
           "August" = "month_08",
           "September" = "month_09",
           "October" = "month_10",
           "November" = "month_11",
           "December" = "month_12",
           "Monday" = "weekday_Mon",
           "Tuesday" = "weekday_Tue",
           "Wednesday" = "weekday_Wed",
           "Thursday" = "weekday_Thu",
           "Friday" = "weekday_Fri",
           "Saturday" = "weekday_Sat",
           "Sunday" = "weekday_Sun",
           "Disaster" = "disaster")

imp<-varImp(fit.xgb_da_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_sincefeb2022$pred$obs, pred = fit.xgb_da_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                              tibble(Model = 'Since Feb 2022',
                                     model_response_name = 'da',
                                     daterange = 'sincefeb2022',
                                     'Coef' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                     'R-Sq' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_sincefeb2022$pred$pred))$r.squared,
                                     'Disaster RMSE' = NA,
                                     'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_sincefeb2022$trainingData),
                       fit.xgb_da_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                        model_response_name = 'da',
                        daterange = 'sincefeb2022',
                        'R-Sq' = test_adj_r2,
                        'BT R-Sq' = NA,
                        'RMSE' = test_rmse,
                        'BT RMSE' = NA))

Disaster

fit.xgb_da_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 868.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 300
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 121     300         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_da_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'da',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_da_dis$pred$obs ~
                                                        fit.xgb_da_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_dis$pred$obs ~
                                                        fit.xgb_da_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_dis$trainingData),
                       fit.xgb_da_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'da',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_da_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_da_excl_dis.rds')
getTrainPerf(fit.xgb_da_excl_dis)
fit.xgb_da_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 150     700         6 0.1  0.01             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_da_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_excl_dis$pred$obs, 
                                               pred = fit.xgb_da_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'da',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
                                                        fit.xgb_da_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
                                                        fit.xgb_da_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_excl_dis$trainingData), 
                       fit.xgb_da_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'da',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_da_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis_ind.rds')
getTrainPerf(fit.xgb_da_dis_ind)
fit.xgb_da_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 186     700         4 0.3 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_dis_ind$pred$obs, 
                           pred = fit.xgb_da_dis_ind$pred$pred,
                           disaster = fit.xgb_da_dis_ind$trainingData$disaster[fit.xgb_da_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_da_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'da',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
                                                        fit.xgb_da_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
                                                        fit.xgb_da_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_dis_ind$trainingData), 
                       fit.xgb_da_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'da',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_da_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 243     700         5 0.3 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_full$pred$obs, 
                           pred = fit.xgb_da_full$pred$pred,
                           disaster = if_else(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_da_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'da',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_full$trainingData),
                       fit.xgb_da_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'da',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Observed Vs. Predicted Plots 10-fold CV

ggarrange(xgb_sincefeb2022_obs_vs_pred_plot, 
          ggarrange(xgb_disaster_obs_vs_pred_plot, xgb_disaster_obs_vs_pred_plot_zoom,  
                    ncol = 2, labels = c("2", "3")),
          ggarrange(xgb_everything_obs_vs_pred_plot, xgb_everything_obs_vs_pred_plot_zoom, 
                    ncol = 2, labels = c("4", "5")), 
                    labels = c("1"),
                    nrow = 3)
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'

ggarrange(xgb_excl_dis_obs_vs_pred_plot,
          ggarrange(xgb_dis_ind_obs_vs_pred_plot, xgb_dis_ind_obs_vs_pred_plot_zoom,  
                    ncol = 2, labels = c("3", "4")), 
                    labels = c("1"),
                    nrow = 2)

knitr::kable(validation_result, digits = 3)
Model model_response_name daterange Coef R-Sq Disaster RMSE Normal RMSE
Since Feb 2022 da sincefeb2022 1.012 0.759 NA NA
Disaster Only da dis 1.172 0.642 NA NA
Exclude Disaster da excl_dis 1.004 0.673 NA NA
Everything w. Disaster Indicator da dis_ind 1.089 0.965 17.744 0.281
Everything w.o Disaster Indicator da full 1.049 0.989 9.919 0.137

XGBoost: log(H2S_daily_avg)

Since Feb 2022

fit.xgb_da_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_log_h2s_sincefeb2022)
fit.xgb_da_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 700
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123     700         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_da',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_da',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_da_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis)
fit.xgb_da_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 700
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 69     700         5 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_da',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_da_log_h2s_dis$trainingData), 
                       fit.xgb_da_log_h2s_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_dis$trainingData), 
                       fit.xgb_da_log_h2s_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_da',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_da_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_excl_dis)
fit.xgb_da_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123     700         6 0.1 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_da',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_da_log_h2s_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                      nrow(fit.xgb_da_log_h2s_excl_dis$trainingData), 
                      fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)


BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_excl_dis$trainingData), 
                       fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_da',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_da_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis_ind)
fit.xgb_da_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285     700         6 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_log_h2s_dis_ind$pred$obs, 
                           pred = fit.xgb_da_log_h2s_dis_ind$pred$pred,
                           disaster = fit.xgb_da_log_h2s_dis_ind$trainingData$disaster[fit.xgb_da_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_da',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_da_log_h2s_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_dis_ind$trainingData), 
                       fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_da_log_h2s_dis_ind$trainingData), 
                       fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_da',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_da_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_full.rds')
getTrainPerf(fit.xgb_da_log_h2s_full)
fit.xgb_da_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285     700         6 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_da_log_h2s_full$pred$obs, 
                           pred = fit.xgb_da_log_h2s_full$pred$pred,
                           disaster = if_else(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_full$pred$obs), 
                                                              pred = exp(fit.xgb_da_log_h2s_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_da',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
                                                        fit.xgb_da_log_h2s_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
                                                        fit.xgb_da_log_h2s_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_da_log_h2s_full$trainingData), 
                       fit.xgb_da_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_da_log_h2s_full$trainingData), 
                       fit.xgb_da_log_h2s_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_da',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Daily Max)

Since 2022 Feb

fit.xgb_dm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_sincefeb2022)
fit.xgb_dm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 300
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 151     300         6 0.1  0.01             0.75                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_sincefeb2022$pred$obs, pred = fit.xgb_dm_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'dm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_sincefeb2022$trainingData),
                       fit.xgb_dm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'dm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_dm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis.rds')
getTrainPerf(fit.xgb_dm_dis)
fit.xgb_dm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 459.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 300
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 43     300         4 0.1  0.01             0.75                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_dm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'dm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_dm_dis$pred$obs ~
                                                        fit.xgb_dm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_dis$pred$obs ~
                                                        fit.xgb_dm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_dis$trainingData),
                       fit.xgb_dm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'dm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_dm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_excl_dis.rds')
getTrainPerf(fit.xgb_dm_excl_dis)
fit.xgb_dm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 300
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 142     300         6 0.1  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_dm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_excl_dis$pred$obs, 
                                               pred = fit.xgb_dm_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'dm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
                                                        fit.xgb_dm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
                                                        fit.xgb_dm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_excl_dis$trainingData), 
                       fit.xgb_dm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'dm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_dm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis_ind.rds')
getTrainPerf(fit.xgb_dm_dis_ind)
fit.xgb_dm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 294     700         6 0.3 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_dis_ind$pred$obs, 
                           pred = fit.xgb_dm_dis_ind$pred$pred,
                           disaster = fit.xgb_dm_dis_ind$trainingData$disaster[fit.xgb_dm_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_dm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'dm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
                                                        fit.xgb_dm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
                                                        fit.xgb_dm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_dis_ind$trainingData), 
                       fit.xgb_dm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'dm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_dm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_full.rds')
getTrainPerf(fit.xgb_dm_full)
fit.xgb_dm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 231     700         5 0.3 0.001             0.75                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_full$pred$obs, 
                           pred = fit.xgb_dm_full$pred$pred,
                           disaster = if_else(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_dm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'dm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
train_adj_r2 <- adj_r2(getTrainPerf(fit.xgb_dm_full)$TrainRsquared,
                       nrow(fit.xgb_dm_full$trainingData),
                       fit.xgb_dm_full$finalModel$nfeatures)
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_dm_full$trainingData),
                       fit.xgb_dm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'dm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_daily_max)

Since Feb 2022

fit.xgb_dm_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_log_h2s_sincefeb2022)
fit.xgb_dm_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 300
## nfeatures : 41 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 133     300         6 0.1 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_dm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
                                                        fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData), 
                       fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_dm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_dm_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis)
fit.xgb_dm_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 793.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 500
## nfeatures : 31 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 32     500         4 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_dm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_dis$trainingData), 
                       fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_dis$trainingData), 
                       fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_dm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_dm_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_excl_dis)
fit.xgb_dm_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 500
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 158     500         6 0.1  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_dm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
                                                        fit.xgb_dm_log_h2s_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData), 
                       fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData), 
                       fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_dm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_dm_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis_ind)
fit.xgb_dm_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 700
## nfeatures : 44 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321     700         6 0.3  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_dis_ind$pred$obs, 
                           pred = fit.xgb_dm_log_h2s_dis_ind$pred$pred,
                           disaster = fit.xgb_dm_log_h2s_dis_ind$trainingData$disaster[fit.xgb_dm_log_h2s_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_dm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
                                                        fit.xgb_dm_log_h2s_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData), 
                       fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData), 
                       fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_dm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_dm_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_full.rds')
getTrainPerf(fit.xgb_dm_log_h2s_full)
fit.xgb_dm_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 700
## nfeatures : 43 
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321     700         6 0.3  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_full$pred$obs, 
                           pred = fit.xgb_dm_log_h2s_full$pred$pred,
                           disaster = if_else(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_full$pred$obs), 
                                                              pred = exp(fit.xgb_dm_log_h2s_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_dm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
                                                        fit.xgb_dm_log_h2s_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
                                                        fit.xgb_dm_log_h2s_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_dm_log_h2s_full$trainingData), 
                       fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_dm_log_h2s_full$trainingData), 
                       fit.xgb_dm_log_h2s_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_dm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Hourly Average)

Since 2022 Feb

fit.xgb_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_ha_sincefeb2022)
fit.xgb_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x", 
           "Latitude" = "mon_utm_y",
           "Distance to Refinery" = "dist_ref", 
           "Angle to Refinery" = "angle_ref",
           "Active Wells within 2km" = "active_2km", 
           "Inactive Wells within 2km" = "inactive_2km",
           "Monthly Oil Production 2km" = "monthly_oil_2km",
           "Monthly Gas Production 2km" = "monthly_gas_2km",
           "Distance to WRP" = "dist_wrp",
           "WRP Capacity" = "closest_wrp_capacity",
           "Angle to WRP" = "angle_wrp",
           "Distance to Dominguez Channel" = "dist_dc",
           "Hourly Temperature" = "hourly_temp",
           "Hourly Humidity" = "hourly_hum",
           "Hourly Precipitation" = "hourly_precip",
           "Hourly Wind Speed" = "ws_avg",
           "Hourly Wind Direction" = "wd_avg",
           "Downwind Refinery" = "hourly_downwind_ref",
           "Downwind WRP" = "hourly_downwind_wrp",
           "Elevation" = "elevation",
           "Enhanced Vegetation Index" = "EVI",
           "Number of Daily Odor Complaints" = "num_odor_complaints",
           "2020" = "year_2020",
           "2021" = "year_2021",
           "2022" = "year_2022",
           "2023" = "year_2023",
           "January" = "month_01",
           "February" = "month_02",
           "March" = "month_03",
           "April" = "month_04",
           "May" = "month_05",
           "June" = "month_06", 
           "July" = "month_07",
           "August" = "month_08",
           "September" = "month_09",
           "October" = "month_10",
           "November" = "month_11",
           "December" = "month_12",
           "Monday" = "weekday_Mon",
           "Tuesday" = "weekday_Tue",
           "Wednesday" = "weekday_Wed",
           "Thursday" = "weekday_Thu",
           "Friday" = "weekday_Fri",
           "Saturday" = "weekday_Sat",
           "Sunday" = "weekday_Sun",
           "Disaster" = "disaster")

imp<-varImp(fit.xgb_ha_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_ha_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_sincefeb2022$pred$obs, pred = fit.xgb_ha_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                              tibble(Model = 'Since Feb 2022',
                                     model_response_name = 'ha',
                                     daterange = 'sincefeb2022',
                                     'Coef' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_ha_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                     'R-Sq' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_ha_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_sincefeb2022$trainingData),
                       fit.xgb_ha_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'ha',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis.rds')
getTrainPerf(fit.xgb_ha_dis)
fit.xgb_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 84.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 50
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 51      50         4 0.4 0.001              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_ha_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'ha',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_ha_dis$pred$obs ~
                                                        fit.xgb_ha_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_dis$pred$obs ~
                                                        fit.xgb_ha_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_dis$trainingData),
                       fit.xgb_ha_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'ha',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_ha_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_excl_dis.rds')
getTrainPerf(fit.xgb_ha_excl_dis)
fit.xgb_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.1 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 52     150         4 0.4 0.001              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_ha_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_excl_dis$pred$obs, 
                                               pred = fit.xgb_ha_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'ha',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
                                                        fit.xgb_ha_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
                                                        fit.xgb_ha_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_ha_excl_dis$trainingData), 
                       fit.xgb_ha_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'ha',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis_ind.rds')
getTrainPerf(fit.xgb_ha_dis_ind)
fit.xgb_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62     150         4 0.4  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_ha_dis_ind$pred$obs, 
                           pred = fit.xgb_ha_dis_ind$pred$pred,
                           disaster = fit.xgb_ha_dis_ind$trainingData$disaster[fit.xgb_ha_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_ha_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'ha',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
                                                        fit.xgb_ha_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
                                                        fit.xgb_ha_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_ha_dis_ind$trainingData), 
                       fit.xgb_ha_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'ha',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_full.rds')
getTrainPerf(fit.xgb_ha_full)
fit.xgb_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_ha_full$pred$obs, 
                           pred = fit.xgb_ha_full$pred$pred,
                           disaster = if_else(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_ha_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'ha',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_ha_full$trainingData),
                       fit.xgb_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'ha',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_hourly_avg)

Since Feb 2022

fit.xgb_log_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_ha_sincefeb2022)
fit.xgb_log_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_ha',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_ha_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_ha_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_sincefeb2022$trainingData), 
                       fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_sincefeb2022$trainingData), 
                       fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_ha',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_log_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis.rds')
getTrainPerf(fit.xgb_log_ha_dis)
fit.xgb_log_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 250.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 150
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_ha',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
                                                        fit.xgb_log_ha_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
                                                        fit.xgb_log_ha_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_dis$trainingData), 
                       fit.xgb_log_ha_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_dis$trainingData), 
                       fit.xgb_log_ha_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_ha',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_log_ha_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_excl_dis.rds')
getTrainPerf(fit.xgb_log_ha_excl_dis)
fit.xgb_log_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_ha',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
                                                        fit.xgb_log_ha_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
                                                        fit.xgb_log_ha_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_excl_dis$trainingData), 
                       fit.xgb_log_ha_excl_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_excl_dis$trainingData), 
                       fit.xgb_log_ha_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_ha',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_log_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis_ind.rds')
getTrainPerf(fit.xgb_log_ha_dis_ind)
fit.xgb_log_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_ha_dis_ind$pred$obs, 
                           pred = fit.xgb_log_ha_dis_ind$pred$pred,
                           disaster = fit.xgb_log_ha_dis_ind$trainingData$disaster[fit.xgb_log_ha_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_ha',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
                                                        fit.xgb_log_ha_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
                                                        fit.xgb_log_ha_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_dis_ind$trainingData), 
                       fit.xgb_log_ha_dis_ind$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_ha_dis_ind$trainingData), 
                       fit.xgb_log_ha_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_ha',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_log_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_full.rds')
getTrainPerf(fit.xgb_log_ha_full)
fit.xgb_log_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62     150         4 0.4  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_ha_full$pred$obs, 
                           pred = fit.xgb_log_ha_full$pred$pred,
                           disaster = if_else(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_full$pred$obs), 
                                                              pred = exp(fit.xgb_log_ha_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_ha',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
                                                        fit.xgb_log_ha_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
                                                        fit.xgb_log_ha_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_ha_full$trainingData), 
                       fit.xgb_log_ha_full$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_log_ha_full$trainingData), 
                       fit.xgb_log_ha_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_ha',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGBoost (Hourly Max)

Since 2022 Feb

fit.xgb_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_hm_sincefeb2022)
fit.xgb_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 30     150         4 0.2  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# we use savePredictions = 'final' to store the predictions on the test set at each fold

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_hm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_sincefeb2022$pred$obs, pred = fit.xgb_hm_sincefeb2022$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'hm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_hm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_hm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_sincefeb2022$trainingData),
                       fit.xgb_hm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'hm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Disaster

fit.xgb_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis.rds')
getTrainPerf(fit.xgb_hm_dis)
fit.xgb_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 86.2 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 50
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 29      50         4 0.2  0.01                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_hm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        # geom_abline(intercept = 0, slope = 1, color = 'red') +
                        # geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'hm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_hm_dis$pred$obs ~
                                                        fit.xgb_hm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_dis$pred$obs ~
                                                        fit.xgb_hm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_dis$trainingData),
                       fit.xgb_hm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'hm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Exclude Disaster

fit.xgb_hm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_excl_dis.rds')
getTrainPerf(fit.xgb_hm_excl_dis)
fit.xgb_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 60     150         4 0.4  0.01              0.8                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

fold_stat <- fit.xgb_hm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_excl_dis$pred$obs, 
                                               pred = fit.xgb_hm_excl_dis$pred$pred),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for exclude disaster XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_excl_dis_obs_vs_pred_plot

validation_result <- rbind(validation_result,  
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'hm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
                                                        fit.xgb_hm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
                                                        fit.xgb_hm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_hm_excl_dis$trainingData), 
                       fit.xgb_hm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'hm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w Disaster Indicator

fit.xgb_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis_ind.rds')
getTrainPerf(fit.xgb_hm_dis_ind)
fit.xgb_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_hm_dis_ind$pred$obs, 
                           pred = fit.xgb_hm_dis_ind$pred$pred,
                           disaster = fit.xgb_hm_dis_ind$trainingData$disaster[fit.xgb_hm_dis_ind$pred$rowIndex])

fold_stat <- fit.xgb_hm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        theme_bw()

xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs, 
                                                   pred = test_result_data$pred,
                                            disaster = test_result_data$disaster),
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'hm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
                                                        fit.xgb_hm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
                                                        fit.xgb_hm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2, 
                       nrow(fit.xgb_hm_dis_ind$trainingData), 
                       fit.xgb_hm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'hm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

Everything w.o Disaster Indicator

fit.xgb_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_full.rds')
getTrainPerf(fit.xgb_hm_full)
fit.xgb_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 248.7 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54     150         4 0.4 0.001                1                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_hm_full$pred$obs, 
                           pred = fit.xgb_hm_full$pred$pred,
                           disaster = if_else(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_10 == 1 |
                                                 fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_11 == 1 |
                                                 fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

fold_stat <- fit.xgb_hm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()

xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
                             aes(x = pred, y = obs)) +
                        geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq", "R2", "n"))) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
                        theme_bw()
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'hm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test_result_data$obs ~
                                                        test_result_data$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                         test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_hm_full$trainingData),
                       fit.xgb_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'hm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = NA,
                              'RMSE' = test_rmse,
                              'BT RMSE' = NA))

XGBoost: log(H2S_hourly_max)

Since Feb 2022

fit.xgb_log_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_hm_sincefeb2022)
fit.xgb_log_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41 
## niter: 150
## nfeatures : 41 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_sincefeb2022,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_sincefeb2022$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_sincefeb2022$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_sincefeb2022$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Since Februrary 2022') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  model_response_name = 'log_hm',
                                  daterange = 'sincefeb2022',
                                  'Coef' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_hm_sincefeb2022$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
                                                        fit.xgb_log_hm_sincefeb2022$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_sincefeb2022$trainingData), 
                       fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_sincefeb2022$trainingData), 
                       fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Since Feb 2022',
                              model_response_name = 'log_hm',
                              daterange = 'sincefeb2022',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Disaster Only

fit.xgb_log_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis.rds')
getTrainPerf(fit.xgb_log_hm_dis)
fit.xgb_log_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 150
## nfeatures : 31 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  model_response_name = 'log_hm',
                                  daterange = 'dis',
                                  'Coef' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
                                                        fit.xgb_log_hm_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
                                                        fit.xgb_log_hm_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_dis$trainingData), 
                       fit.xgb_log_hm_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_dis$trainingData), 
                       fit.xgb_log_hm_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Disaster Only',
                              model_response_name = 'log_hm',
                              daterange = 'dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Exclude Disaster

fit.xgb_log_hm_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_excl_dis.rds')
getTrainPerf(fit.xgb_log_hm_excl_dis)
fit.xgb_log_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_excl_dis,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_excl_dis$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_excl_dis$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_excl_dis$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Exclude Disaster') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +                                                stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Exclude Disaster',
                                  model_response_name = 'log_hm',
                                  daterange = 'excl_dis',
                                  'Coef' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
                                                        fit.xgb_log_hm_excl_dis$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
                                                        fit.xgb_log_hm_excl_dis$pred$pred))$r.squared,
                                  'Disaster RMSE' = NA,
                                  'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_excl_dis$trainingData), 
                       fit.xgb_log_hm_excl_dis$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_excl_dis$trainingData), 
                       fit.xgb_log_hm_excl_dis$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Exclude Disaster',
                              model_response_name = 'log_hm',
                              daterange = 'excl_dis',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w. Disaster Indicator

fit.xgb_log_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis_ind.rds')
getTrainPerf(fit.xgb_log_hm_dis_ind)
fit.xgb_log_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44 
## niter: 150
## nfeatures : 44 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56     150         4 0.4 0.001                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis_ind,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_hm_dis_ind$pred$obs, 
                           pred = fit.xgb_log_hm_dis_ind$pred$pred,
                           disaster = fit.xgb_log_hm_dis_ind$trainingData$disaster[fit.xgb_log_hm_dis_ind$pred$rowIndex])

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis_ind$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis_ind$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_dis_ind$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w. Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) + 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w. Disaster Indicator',
                                  model_response_name = 'log_hm',
                                  daterange = 'dis_ind',
                                  'Coef' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
                                                        fit.xgb_log_hm_dis_ind$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
                                                        fit.xgb_log_hm_dis_ind$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_dis_ind$trainingData), 
                       fit.xgb_log_hm_dis_ind$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt,
                       nrow(fit.xgb_log_hm_dis_ind$trainingData), 
                       fit.xgb_log_hm_dis_ind$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w. Disaster Indicator',
                              model_response_name = 'log_hm',
                              daterange = 'dis_ind',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

Everything w.o Disaster Indicator

fit.xgb_log_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_full.rds')
getTrainPerf(fit.xgb_log_hm_full)
fit.xgb_log_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43 
## niter: 150
## nfeatures : 43 
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64     150         4 0.4  0.01                1                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_full,scale=FALSE)

# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
    pivot_wider(names_from = variable, 
                values_from = importance) %>%
    rename(any_of(names)) %>%
    pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')

imp %>%
  top_n(15, importance) %>%
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_point() +
  geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
  ylab("importance") +
  xlab("Variable") +
  coord_flip() +
  theme_minimal()

test_result_data <- tibble(obs = fit.xgb_log_hm_full$pred$obs, 
                           pred = fit.xgb_log_hm_full$pred$pred,
                           disaster = if_else(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$year_2021 == 1 &
                                                (fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_10 == 1 |
                                                   fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_11 == 1 |
                                                   fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))

# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_full$pred %>% group_by(Resample) %>% 
  summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
            R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_full$pred$obs), 
                                                              pred = exp(fit.xgb_log_hm_full$pred$pred)),
                             aes(x = pred, y = obs)) +
                        geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +                                                         
                        geom_point() +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator') +
                        stat_poly_line() +
                        stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + 
                        stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +                                                 
                        stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
                        theme_bw()
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  model_response_name = 'log_hm',
                                  daterange = 'full',
                                  'Coef' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
                                                        fit.xgb_log_hm_full$pred$pred))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
                                                        fit.xgb_log_hm_full$pred$pred))$r.squared,
                                  'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
                                                         test_result_data$obs[which(test_result_data$disaster == 1)]),
                                  'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
                                                       test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
                       nrow(fit.xgb_log_hm_full$trainingData), 
                       fit.xgb_log_hm_full$finalModel$nfeatures)

BT_adj_r2 <- adj_r2(test_r2_bt, 
                       nrow(fit.xgb_log_hm_full$trainingData), 
                       fit.xgb_log_hm_full$finalModel$nfeatures)

xgb_result <- rbind(xgb_result, 
                       tibble(Model = 'Everything w.o Disaster Indicator',
                              model_response_name = 'log_hm',
                              daterange = 'full',
                              'R-Sq' = test_adj_r2,
                              'BT R-Sq' = BT_adj_r2,
                              'RMSE' = test_rmse,
                              'BT RMSE' = test_rmse_bt))

XGB Model performances

validation_result
xgb_result

GAM VS XGBoost

base_validation_result <- validation_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == '') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  arrange(factor(Model, levels = unique(validation_result$Model)))

log_validation_result <- validation_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == 'Log') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max'))

base_xgb_result <- xgb_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == '') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  select(-transformation, -`BT R-Sq`, -`BT RMSE`)

log_xgb_result <- xgb_result %>%
  mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
  filter(transformation == 'Log') %>%
  mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
                                   str_detect(model_response_name, 'dm') ~ 'Daily Max',
                                   str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
                                   str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
  select(-transformation)
# This is the result from regressing the observed on predicted
options(knitr.kable.NA = '')
validation_result_table <- base_validation_result %>% 
  left_join(log_validation_result, join_by(Model, response_base)) %>%
  select(all_of(c('response_base', 'Coef.x', 'R-Sq.x', 'Normal RMSE.x', 
                  'Disaster RMSE.x', 'Coef.y', 'R-Sq.y', 'Normal RMSE.y', 
                  'Disaster RMSE.y'))) %>%
  setNames(c('Response', 'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE', 
             'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE'))

validation_result_table %>%
  knitr::kable(format = 'pipe', digits = 3, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 1, 'No Transformation' = 4, 'Log-Transformation' = 4)) %>%
  kable_styling()
No Transformation
Log-Transformation
Response Coef R2 Normal RMSE Disaster RMSE Coef R2 Normal RMSE Disaster RMSE
Since Feb 2022
Daily Avg 1.012 0.759 1.019 0.814
Daily Max 0.743 0.207 0.992 0.614
Hourly Avg 1.011 0.533 1.028 0.698
Hourly Max 1.033 0.365 1.025 0.638
Disaster Only
Daily Avg 1.172 0.642 1.015 0.864
Daily Max 1.041 0.532 1.020 0.821
Hourly Avg 0.925 0.434 1.002 0.771
Hourly Max 0.894 0.485 1.000 0.750
Exclude Disaster
Daily Avg 1.004 0.673 1.025 0.792
Daily Max 0.793 0.137 0.984 0.608
Hourly Avg 1.033 0.403 1.047 0.653
Hourly Max 0.965 0.276 1.043 0.604
Everything w D.I
Daily Avg 1.089 0.965 0.281 17.744 1.009 0.977 0.121 0.135
Daily Max 0.997 0.998 1.047 27.285 1.008 0.961 0.202 0.205
Hourly Avg 1.013 0.940 1.686 32.007 1.043 0.680 0.496 0.669
Hourly Max 1.042 0.915 4.313 81.977 1.043 0.642 0.567 0.719
Everything w.o D.I
Daily Avg 1.049 0.989 0.137 9.919 1.011 0.975 0.121 0.198
Daily Max 0.990 0.994 5.095 50.451 1.010 0.958 0.212 0.197
Hourly Avg 1.030 0.958 2.041 26.420 1.045 0.679 0.496 0.674
Hourly Max 0.948 0.874 4.301 100.490 1.046 0.641 0.567 0.728
full_result_table_fordisp <- base_table %>%
  left_join(log_table %>% select(-n), join_by(date_names, response_base)) %>%
  left_join(base_xgb_result, join_by(date_names == Model, response_base)) %>%
  left_join(log_xgb_result, join_by(date_names == Model, response_base)) %>%
  select(-starts_with('model_response_name'), -starts_with('daterange'), -'date_names') %>%
  select(all_of(c('response_base', 'n', 'adjr2.x', 'p.x', 'adjr2.y', 'bt_adjr2', 'p.y', 'R-Sq.x', 'RMSE.x', 'R-Sq.y', 'RMSE.y', 'BT R-Sq', 'BT RMSE'))) %>%
  setNames(c('Response','N', 'R2', 'P', 'R2', 'BT R2', 'P',
             c('R2', 'RMSE'),  
             c('R2', 'RMSE', 'BT R2', 'BT RMSE')))
  

full_result_table_kable <- full_result_table_fordisp %>%
  knitr::kable(format = 'latex', digits = 2) %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
  add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
  add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6))

writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')

full_result_table_fordisp %>%
  knitr::kable(format = 'pipe', digits = 4, table.attr = "style='width:100%;'") %>%
  pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4, 
                      "Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
  add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
  add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
  add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6)) %>%
  kable_styling()
GAM
XGBoost
No Transformation
Log-Transformation
No Transformation
Log-Transformation
Test
Test
Response N R2 P R2 BT R2 P R2 RMSE R2 RMSE BT R2 BT RMSE
Since Feb 2022
Daily Avg 6531 0.6187 127 0.7001 0.6174 126 0.7614 0.2478 0.8127 0.3059 0.7637 0.2478
Daily Max 6531 0.2086 126 0.5438 0.2177 125 0.2879 3.1244 0.6123 0.5722 0.3557 2.9330
Hourly Avg 153718 0.3521 133 0.5480 0.3414 132 0.5345 0.4892 0.6975 0.4493 0.5042 0.5114
Hourly Max 153718 0.1748 132 0.4962 0.1611 132 0.3769 1.0293 0.6383 0.5400 0.2907 1.1047
Disaster Only
Daily Avg 1273 0.4170 105 0.7458 0.5459 111 0.5257 40.7997 0.8532 0.4274 0.8037 30.9226
Daily Max 1273 0.4320 99 0.7246 0.4399 110 0.7240 360.8105 0.8146 0.6128 0.7604 296.5671
Hourly Avg 30242 0.2097 114 0.5830 0.2170 114 0.4512 96.7972 0.7705 0.6075 0.4515 97.9899
Hourly Max 30242 0.2229 114 0.5601 0.2360 114 0.5473 196.2375 0.7492 0.6631 0.4520 213.3409
Exclude Disaster
Daily Avg 14322 0.4568 128 0.5671 0.4575 129 0.7043 0.3188 0.7915 0.3326 0.6951 0.3258
Daily Max 14322 0.1230 123 0.4810 0.1071 128 0.2722 3.4251 0.6073 0.5725 0.3417 3.2564
Hourly Avg 337596 0.2458 132 0.4425 0.2478 133 0.4896 0.6074 0.6526 0.4940 0.4561 0.6354
Hourly Max 337596 0.1484 131 0.4097 0.1404 133 0.3360 1.1044 0.6043 0.5672 0.2918 1.1416
Everything w D.I
Daily Avg 15595 0.1136 123 0.5511 -0.0078 128 0.9816 2.5865 0.9775 0.1219 0.9812 1.6211
Daily Max 15595 0.1235 123 0.5032 -0.0079 128 0.9829 5.5020 0.9609 0.2016 0.9980 4.3671
Hourly Avg 367838 0.0488 134 0.4381 0.0016 133 0.9322 8.6180 0.6801 0.5121 0.4724 29.6837
Hourly Max 367838 0.0508 133 0.4124 0.0006 134 0.9223 21.4359 0.6415 0.5812 0.4736 62.7187
Everything w.o D.I
Daily Avg 15595 0.1123 122 0.5436 -0.0078 127 0.9968 1.1273 0.9747 0.1289 0.9073 4.5043
Daily Max 15595 0.1223 123 0.4937 -0.0079 128 0.9797 8.7597 0.9574 0.2098 0.9966 7.8280
Hourly Avg 367838 0.0483 133 0.4348 0.0012 132 0.9491 7.5139 0.6790 0.5131 0.4320 29.1288
Hourly Max 367838 0.0503 133 0.4090 0.0004 133 0.8837 25.5470 0.6408 0.5819 0.4550 63.2933